Following this question and this one, I wondered what was the best option to summarise categorical variables in one dataset.
I have a dataset such as
# A tibble: 10 × 4
empstat_couple nssec7_couple3 nchild07 age_couple
<chr> <fctr> <fctr> <dbl>
1 Neo-Trad Lower Managerial 1child 39
2 Neo-Trad Higher Managerial 1child 31
3 Neo-Trad Manual and Routine 1child 33
4 Trad Higher Managerial 1child 43
The first three variables are categorical (character or factor) and the last one is numerical.
What I would like is something like (output)
var n p
1: Neo-Trad 6 0.6
2: OtherArrangment 2 0.2
3: Trad 2 0.2
4: Higher Managerial 4 0.4
5: Lower Managerial 5 0.5
6: Manual and Routine 1 0.1
7: 1child 9 0.9
8: 2children 1 0.1
Well for the numerical variable, I am unsure how to add it meaningfully to the summary.
I guess the most basic way to go is
library(dplyr)
library(data.table)
a = count(dt, empstat_couple) %>% mutate(p = n / sum(n))
b = count(dt, nssec7_couple3) %>% mutate(p = n / sum(n))
c = count(dt, nchild07) %>% mutate(p = n / sum(n))
rbindlist(list(a,b,c))
I wondered if a summarise_each solution existed ?
This doesn't work
dt %>% summarise_each(funs(count))
Using apply I could come up with this
apply(dt, 2, as.data.frame(table)) %>% rbindlist()
But it's not great.
Any suggestions ?
data
dt = structure(list(empstat_couple = c("Neo-Trad", "Neo-Trad", "Neo-Trad",
"Trad", "OtherArrangment", "Neo-Trad", "Trad", "OtherArrangment",
"Neo-Trad", "Neo-Trad"), nssec7_couple3 = structure(c(2L, 1L,
4L, 1L, 2L, 2L, 1L, 2L, 1L, 2L), .Label = c("Higher Managerial",
"Lower Managerial", "Intermediate", "Manual and Routine"), class = "factor"),
nchild07 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L), .Label = c("1child", "2children", ">2children"), class = "factor"),
age_couple = c(39, 31, 33, 43, 32, 28, 28, 40, 33, 26), hldid = 1:10), .Names = c("empstat_couple",
"nssec7_couple3", "nchild07", "age_couple", "hldid"), row.names = c(NA,
-10L), class = "data.frame")
We can melt with data.table and get the .N and proportion
library(data.table)
# Melt only the categorical (character/factor) columns. Numeric columns such as
# age_couple and the hldid identifier stay as id columns, so they are never
# tabulated as if they were categories.
is_categorical <- vapply(
  dt,
  function(col) is.character(col) || is.factor(col),
  logical(1)
)
long <- melt(setDT(dt), measure.vars = names(dt)[is_categorical])
# Count per (variable, value) pair so that a label shared by two variables is
# not pooled, then convert the counts into within-variable proportions.
long[, .(n = .N), by = .(variable, value)
][, p := n / sum(n), by = variable
][, variable := NULL][]
Or using dplyr/tidyr
library(dplyr)
library(tidyr)
dt %>%
  # Reshape only the categorical columns; numeric columns (age_couple, hldid)
  # are left out of the long format instead of being counted as categories.
  pivot_longer(
    cols = where(function(col) is.character(col) || is.factor(col)),
    names_to = "var1",
    values_to = "var"
  ) %>%
  # Keep the variables in their original column order (count() would
  # otherwise sort the names alphabetically).
  mutate(var1 = factor(var1, levels = unique(var1))) %>%
  # Count per (variable, value) pair so identical labels in different
  # variables are not pooled together.
  count(var1, var) %>%
  group_by(var1) %>%
  mutate(p = n / sum(n)) %>%
  ungroup() %>%
  select(-var1)
Related
My df, Chap3, has ~50 categorical variables. I want to produce a frequency table for each categorical variable that also includes percentages. The code below works fine for the single var bsex but I cannot figure out how to repeat it for all categorical vars. Have tried using variants of apply, using select_if(is.factor), etc, to no avail.
Chap3 %>%
count(bsex) %>%
mutate(percent = round(n / sum(n) * 100,1))
For such cases it is better if you get the categorical data in long format.
library(dplyr)
library(tidyr)
Chap3 %>%
  # One row per (variable, level) observation for every factor column.
  pivot_longer(cols = where(is.factor)) %>%
  count(name, value) %>%
  group_by(name) %>%
  # Keep the raw count `n` and add the within-variable percentage next to it,
  # using the same formula as the single-variable code in the question.
  mutate(percent = round(n / sum(n) * 100, 1)) %>%
  ungroup()
# name value n percent
# <chr> <fct> <int> <dbl>
#1 bsex 0 4 40
#2 bsex 1 6 60
#3 csex 0 5 50
#4 csex 1 5 50
data
It is easier to help if you provide data in a reproducible format
set.seed(123)
Chap3 <- data.frame(id = 1:10,
bsex = factor(sample(c(1, 0), 10, replace = TRUE)),
csex = factor(sample(c(1, 0), 10, replace = TRUE)))
We may use table/proportions from base R
proportions(table(stack(type.convert(Chap3[-1], as.is = TRUE))), 2)
ind
values bsex csex
0 0.4 0.5
1 0.6 0.5
data
Chap3 <- structure(list(id = 1:10, bsex = structure(c(2L, 2L, 2L, 1L,
2L, 1L, 1L, 1L, 2L, 2L), .Label = c("0", "1"), class = "factor"),
csex = structure(c(1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L
), .Label = c("0", "1"), class = "factor")), class = "data.frame", row.names = c(NA,
-10L))
This question is inspired by this and this question.
I am trying to calculate the proportion of different values within each group, but I do not want to create "new" rows for the groups but new columns.
Taking the example from the second question above. If I have the following data:
data <- structure(list(value = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L), class = structure(c(1L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L), .Label = c("A",
"B"), class = "factor")), .Names = c("value", "class"), class = "data.frame", row.names = c(NA,
-16L))
I can calculate the proportion of each value (1,2,3) in each class (A,B):
data %>%
group_by(value, class) %>%
summarise(n = n()) %>%
complete(class, fill = list(n = 0)) %>%
group_by(class) %>%
mutate(freq = n / sum(n))
# A tibble: 6 x 4
value class n freq
<int> <fctr> <dbl> <dbl>
1 1 A 3 0.2727273
2 1 B 3 0.6000000
3 2 A 4 0.3636364
4 2 B 2 0.4000000
5 3 A 4 0.3636364
6 3 B 0 0.0000000
However I end up with a line for each value/class pair instead I want something like this:
# some code
# A tibble: 6 x 4
class n 1 2 3
<fctr> <dbl> <dbl> <dbl> <dbl>
1 A 11 0.2727273 0.3636364 0.3636364
2 B 5 0.6000000 0.4000000 0.0000000
With a column for each group. I could write for loops to construct a new data frame from the old one but I am certain there is a better way. Any suggestions?
Thank you
We can use pivot_wider at the end
library(dplyr)
library(tidyr)
data %>%
  group_by(value, class) %>%
  summarise(n = n()) %>%
  # Add the missing value/class combinations with a zero count.
  complete(class, fill = list(n = 0)) %>%
  group_by(class) %>%
  # freq must be computed before n is overwritten with the class total.
  mutate(freq = n / sum(n), n = sum(n)) %>%
  # Drop the grouping so the wide result is a plain tibble.
  ungroup() %>%
  pivot_wider(names_from = value, values_from = freq)
# A tibble: 2 x 5
# class n `1` `2` `3`
# <fct> <dbl> <dbl> <dbl> <dbl>
#1 A 11 0.273 0.364 0.364
#2 B 5 0.6 0.4 0
Or, as @IcecreamToucan mentioned, `complete` is not needed because `pivot_wider` has an option to fill missing combinations with a custom value (the default is NA)
data %>%
group_by(value, class) %>%
summarise(n = n()) %>%
group_by(class) %>%
mutate(freq = n / sum(n), n = sum(n)) %>%
pivot_wider(names_from = value, values_from = freq, values_fill = list(freq = 0))
If we are using a previous version of tidyr, then use spread
data %>%
group_by(value, class) %>%
summarise(n = n()) %>%
complete(class, fill = list(n = 0)) %>%
group_by(class) %>%
mutate(freq = n / sum(n), n = sum(n)) %>%
spread(value, freq)
Method using data.table::dcast instead of pivot_wider.
Line 1: Get a count (.N) for each (value, class) group, and call it n
Line 2: Make new variables within each class group:
N, the sum of the previous counts
pct, the percent of N each n makes up
Line 3: Cast to wide with class and N as the rows, value as the column names, and pct as the column elements, with empty elements set to 0.
library(magrittr) # For %>%. Not necessary if dplyr is loaded already
library(data.table)
setDT(data)
data[, .(n = .N), by = .(value, class)] %>%
.[, `:=`(N = sum(n), pct = n/sum(n)), by = class] %>%
dcast(class + N ~ value, value.var = 'pct', fill = 0)
# class N 1 2 3
# 1: A 11 0.2727273 0.3636364 0.3636364
# 2: B 5 0.6000000 0.4000000 0.0000000
We can use count to count occurrences of value and class, group_by class, calculate the frequency and get the data in wide format.
library(dplyr)
library(tidyr)
data %>%
count(value, class) %>%
group_by(class) %>%
mutate(freq = n/sum(n), n = sum(n)) %>%
pivot_wider(names_from = value, values_from = freq, values_fill = list(freq = 0))
# class n `1` `2` `3`
# <fct> <int> <dbl> <dbl> <dbl>
#1 A 11 0.273 0.364 0.364
#2 B 5 0.6 0.4 0
This question already has answers here:
Mean per group in a data.frame [duplicate]
(8 answers)
Closed 3 years ago.
I can't get the average accuracies (proportion of TRUE values) in Correct_answers columns for the groups chart type and condition.
data
structure(list(Element = structure(c(1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3", "4", "5", "6"), class = "factor"), Correct_answer = structure(c(2L,
2L, 2L, 1L, 2L), .Label = c("FALSE", "TRUE"), class = "factor"),
Response_time = c(25.155, 6.74, 28.649, 16.112, 105.5906238
), Chart_type = structure(c(2L, 2L, 1L, 1L, 1L), .Label = c("Box",
"Violin"), class = "factor"), Condition = structure(c(1L,
2L, 1L, 2L, 1L), .Label = c("0", "1"), class = "factor")), row.names = c(NA,
5L), class = "data.frame")
Average by chart_type
av_data_chartType <- data %>% group_by(Chart_type) %>% summarise_each(funs(mean, sd))
Average by condition
av_data_conition <- data %>% group_by(Condition) %>% summarise_each(funs(mean, sd))
No mean produced for accuracy
An NA value is placed where the accuracy should be.
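Reproducing your code, I got a warning that led me to the answer: you shouldn't compute statistics on factor variables. If you know what you are doing, you can convert them to numeric (note that `as.numeric()` on a factor returns the integer level codes, so `FALSE`/`TRUE` become 1/2 and the means shown below are not proportions):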
data <- structure(list(Element = structure(c(1L, 1L, 1L, 1L, 1L),
.Label = c("1", "2", "3", "4", "5", "6"),
class = "factor"),
Correct_answer = structure(c(2L, 2L, 2L, 1L, 2L),
.Label = c("FALSE", "TRUE"),
class = "factor"),
Response_time = c(25.155, 6.74, 28.649, 16.112, 105.5906238
),
Chart_type = structure(c(2L, 2L, 1L, 1L, 1L),
.Label = c("Box",
"Violin"),
class = "factor"),
Condition = structure(c(1L, 2L, 1L, 2L, 1L),
.Label = c("0", "1"),
class = "factor")),
row.names = c(NA, 5L), class = "data.frame")
library("dplyr", warn.conflicts = FALSE)
data <- data %>% as_tibble
# av_data_chartType
data %>%
group_by(Chart_type) %>%
mutate_if(.predicate = is.factor, .funs = as.numeric) %>%
summarise_each(list( ~mean, ~sd))
#> `mutate_if()` ignored the following grouping variables:
#> Column `Chart_type`
#> # A tibble: 2 x 9
#> Chart_type Element_mean Correct_answer_~ Response_time_m~ Condition_mean
#> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 Box 1 1.67 50.1 1.33
#> 2 Violin 1 2 15.9 1.5
#> # ... with 4 more variables: Element_sd <dbl>, Correct_answer_sd <dbl>,
#> # Response_time_sd <dbl>, Condition_sd <dbl>
# av_data_condition
data %>%
group_by(Condition) %>%
mutate_if(.predicate = is.factor, .funs = as.numeric) %>%
summarise_each(list( ~mean, ~sd))
#> `mutate_if()` ignored the following grouping variables:
#> Column `Condition`
#> # A tibble: 2 x 9
#> Condition Element_mean Correct_answer_~ Response_time_m~ Chart_type_mean
#> <fct> <dbl> <dbl> <dbl> <dbl>
#> 1 0 1 2 53.1 1.33
#> 2 1 1 1.5 11.4 1.5
#> # ... with 4 more variables: Element_sd <dbl>, Correct_answer_sd <dbl>,
#> # Response_time_sd <dbl>, Chart_type_sd <dbl>
Created on 2019-06-11 by the reprex package (v0.2.1)
This should work:
# Correct_answer is a factor with labels "FALSE"/"TRUE"; as.logical() converts
# the labels (not the integer codes), so mean() then gives the proportion of
# TRUE values.
data$Correct_answer <- as.logical(data$Correct_answer)
# Accuracy (mean and sd of Correct_answer) per chart type.
av_data_chartType <- data %>%
  group_by(Chart_type) %>%
  summarise(mean = mean(Correct_answer), sd = sd(Correct_answer))
# Accuracy per condition, stored under its own name so that it does not
# overwrite the chart-type result.
av_data_condition <- data %>%
  group_by(Condition) %>%
  summarise(mean = mean(Correct_answer), sd = sd(Correct_answer))
You had 2 problems:
Your Correct_answer was a factor.
You tried to calculate your functions over every Column
You probably need
library(dplyr)
data %>%
mutate(Correct_answer = as.logical(Correct_answer)) %>%
group_by(Chart_type, Condition) %>%
summarise(avg = mean(Correct_answer))
Or if you need them separately
data %>%
mutate(Correct_answer = as.logical(Correct_answer)) %>%
group_by(Chart_type) %>%
summarise(avg = mean(Correct_answer))
data %>%
mutate(Correct_answer = as.logical(Correct_answer)) %>%
group_by(Condition) %>%
summarise(avg = mean(Correct_answer))
My data is huge but I want to know the row number of similar strings
df<- structure(list(x = structure(c(5L, 5L, 5L, 5L, 1L, 1L, 3L, 5L,
5L, 6L, 6L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 3L), .Label = c("AJ5ter2",
"al-1Tter2", "AY9ter2", "CY-Yter2", "LK2ter2", "YY49ter2"), class = "factor")), class = "data.frame", row.names = c(NA,
-19L))
The desired output is shown below
LK2ter2 1:4, 8:9
AJ5ter2 5:6
AY9ter2 7, 19
YY49ter2 10:11
al-1Tter2 12:15
CY-Yter2 16:18
Another option using data.table
library(data.table)
DT <- as.data.table(df)
DT[, .(index = paste(unique(range(.I)), collapse = ":")), by = .(x, rleid(x))
][, .(index = toString(index)), by = x]
# x index
#1: LK2ter2 1:4, 8:9
#2: AJ5ter2 5:6
#3: AY9ter2 7, 19
#4: YY49ter2 10:11
#5: al-1Tter2 12:15
#6: CY-Yter2 16:18
Using tidyverse and data.table you can do:
df %>%
  rowid_to_column() %>%
  # One group per run of consecutive identical values of x.
  group_by(x, run = rleid(x)) %>%
  # range() gives the first and last row of the run; unique() collapses a
  # single-row run to one number, so it prints as "7" rather than "7:7".
  summarise(res = paste(unique(range(rowid)), collapse = ":")) %>%
  group_by(x) %>%
  summarise(res = paste(res, collapse = ", "))
x res
<fct> <chr>
1 AJ5ter2 5:6
2 al-1Tter2 12:15
3 AY9ter2 7, 19
4 CY-Yter2 16:18
5 LK2ter2 1:4, 8:9
6 YY49ter2 10:11
Or the same with just tidyverse:
df %>%
  rowid_to_column() %>%
  # Build a run id with base rle(): each run of identical consecutive values
  # gets its own index, repeated over the length of the run.
  group_by(x, x_rleid = {
    runs <- rle(as.numeric(x))
    rep(seq_along(runs$lengths), runs$lengths)
  }) %>%
  # range() gives the first and last row of the run; unique() collapses a
  # single-row run to one number.
  summarise(res = paste(unique(range(rowid)), collapse = ":")) %>%
  group_by(x) %>%
  summarise(res = paste(res, collapse = ", "))
Both snippets first add a column with the row ID. Second, they group by "x" and the run-length group ID of "x". Third, they check whether the minimum row ID differs from the maximum row ID. If it does, they combine the minimum and maximum row IDs, separated by ":"; otherwise they use just the single row ID. Finally, they group by "x" alone and combine the different elements, separated by ", ".
Or if you want all the values, not just the ranges:
df %>%
rowid_to_column() %>%
group_by(x, x_rleid = {x_rleid = rle(as.numeric(x)); rep(seq_along(x_rleid$lengths), x_rleid$lengths)}) %>%
summarise(res = paste(rowid, collapse = ",")) %>%
group_by(x) %>%
summarise(res = paste(res, collapse = ","))
x res
<fct> <chr>
1 AJ5ter2 5,6
2 al-1Tter2 12,13,14,15
3 AY9ter2 7,19
4 CY-Yter2 16,17,18
5 LK2ter2 1,2,3,4,8,9
6 YY49ter2 10,11
Here's one way with dplyr methods. Not sure if you want text output or a numeric vector
library(tidyverse)
df <- structure(list(x = structure(c(5L, 5L, 5L, 5L, 1L, 1L, 3L, 5L, 5L, 6L, 6L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 3L), .Label = c("AJ5ter2", "al-1Tter2", "AY9ter2", "CY-Yter2", "LK2ter2", "YY49ter2"), class = "factor")), class = "data.frame", row.names = c(NA, -19L))
df %>%
mutate(row_number = row_number()) %>%
group_by(x) %>%
summarise(row_nums = str_c(row_number, collapse = ","))
#> # A tibble: 6 x 2
#> x row_nums
#> <fct> <chr>
#> 1 AJ5ter2 5,6
#> 2 al-1Tter2 12,13,14,15
#> 3 AY9ter2 7,19
#> 4 CY-Yter2 16,17,18
#> 5 LK2ter2 1,2,3,4,8,9
#> 6 YY49ter2 10,11
Created on 2019-02-19 by the reprex package (v0.2.1)
You could try something like:
# split() always returns a list with one integer vector of row numbers per
# level, whereas sapply() would simplify to a matrix whenever every level
# occurs the same number of times.
z <- split(seq_along(df$x), df$x)
data.frame(
  key = names(z),
  # vapply() guarantees exactly one string per level.
  index = vapply(z, paste, character(1), collapse = ", "),
  row.names = NULL
)
key index
1 AJ5ter2 5, 6
2 al-1Tter2 12, 13, 14, 15
3 AY9ter2 7, 19
4 CY-Yter2 16, 17, 18
5 LK2ter2 1, 2, 3, 4, 8, 9
6 YY49ter2 10, 11
Given the following dataset, I want to compute for each row the median of the columns M1,M2 and M3. I am looking for a solution where the final column is added to the dataframe under the name 'Median'. The column names (M1:M3) should not be used directly (in the original dataset, there are many more columns, not just 3).
# A tibble: 8 x 5
I1 M1 M2 I2 M3
<int> <int> <int> <int> <int>
1 3 4 5 3 5
2 2 2 2 2 1
3 2 2 2 2 2
4 3 1 3 3 1
5 2 1 3 3 1
6 3 2 4 4 3
7 3 1 3 4 1
8 2 1 3 2 3
You can load the dataset using:
df = structure(list(I1 = c(3L, 2L, 2L, 3L, 2L, 3L, 3L, 2L), M1 = c(4L,
2L, 2L, 1L, 1L, 2L, 1L, 1L), M2 = c(5L, 2L, 2L, 3L, 3L, 4L, 3L,
3L), I2 = c(3L, 2L, 2L, 3L, 3L, 4L, 4L, 2L), M3 = c(5L, 1L, 2L,
1L, 1L, 3L, 1L, 3L)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -8L), .Names = c("I1", "M1", "M2", "I2",
"M3"))
I know that several similar questions have already been asked. However, most solutions posted use rowMeans or rowSums. I'm looking for a solution where:
no 'row-function' can be used.
the solution is a simple dplyr solution
The reason for (2) is that I am teaching the 'tidyverse' to total beginners.
We could use rowMedians
library(matrixStats)
library(dplyr)
df %>%
mutate(Median = rowMedians(as.matrix(.[grep('M\\d+', names(.))])))
Or if we need to use only tidyverse functions, convert it to 'long' format with gather, summarize by row and get the median of the 'value' column
df %>%
  # Integer row id: a character id would sort as "1", "10", "2", ... and
  # misalign the medians with the original rows once there are 10+ rows.
  mutate(rn = row_number()) %>%
  tidyr::pivot_longer(starts_with("M"), names_to = "key", values_to = "value") %>%
  group_by(rn) %>%
  # One median per original row; summarise() returns rows ordered by rn.
  summarise(Median = median(value)) %>%
  select(Median) %>%
  bind_cols(df, .)
Or another option is rowwise() from dplyr (hope the row is not a problem)
df %>%
  rowwise() %>%
  # c_across() gathers the values of the selected columns for the current row;
  # starts_with("M") selects only names beginning with M, not any name that
  # merely contains an M.
  mutate(Median = median(c_across(starts_with("M")))) %>%
  # Drop the rowwise grouping so later verbs operate on whole columns again.
  ungroup()
Given a dataframe df with some numeric values:
df <- structure(list(X0 = c(0.82046171427112, 0.836224720981912, 0.842547521493854,
0.848014287631906, 0.850943494153631, 0.85425398956647, 0.85616876970771,
0.856855792247478, 0.857471048654811, 0.857507363153284, 0.874487063791594,
1.70684558846347, 1.95711031206168, 6.84386713155156), X1 = c(0.755674148966666,
0.765242580861224, 0.774422478168495, 0.776953642833977, 0.778128315184819,
0.778611604461183, 0.778624581647491, 0.778454002430202, 1.52708579075974,
13.0356519295685, 18.0590093408357, 21.1371199340156, 32.4192814934364,
33.2355314147089), X2 = c(0.772236670327724, 0.788112332251601,
0.797695511542613, 0.804257521548174, 0.809815828400878, 0.816592605516508,
0.819421106011397, 0.821734473885381, 0.822561946509595, 0.822334970491528,
0.822404634095793, 2.66875340820162, 1.40412743557514, 6.33377768022403
), X3 = c(0.764363881671609, 0.788288196346034, 0.79927498357549,
0.805446784334039, 0.810604881970155, 0.814634331592811, 0.817002594424753,
0.818129844752095, 0.818572101954132, 0.818630700031836, 3.06323952591121,
6.4477868357554, 11.4657041958038, 9.27821049066848)), class = "data.frame", row.names = c(NA,
-14L))
One can easily compute row-wise median using base R like so:
# Row-wise median of the first four columns. seq_len() copes with a zero-row
# data frame, and vapply() guarantees one numeric value per row.
df$median <- vapply(
  seq_len(nrow(df)),
  function(i) median(unlist(df[i, 1:4])),
  numeric(1)
)
Above I select columns manually with numeric range, but to satisfy the dplyr requirement you can use dplyr::select() to choose your columns:
# Row-wise median of the columns chosen with dplyr::select(). seq_len() copes
# with a zero-row data frame, and vapply() guarantees one numeric value per row.
df$median <- vapply(
  seq_len(nrow(df)),
  function(i) median(unlist(dplyr::select(df[i, ], X1, X2))),
  numeric(1)
)
I like this method because you don't have to search for different functions to calculate anything.
For example, standard deviation:
# Row-wise standard deviation of the columns chosen with dplyr::select().
# seq_len() copes with a zero-row data frame, and vapply() guarantees one
# numeric value per row.
df$sd <- vapply(
  seq_len(nrow(df)),
  function(i) sd(unlist(dplyr::select(df[i, ], X1, X2))),
  numeric(1)
)