I have a data file with numeric values in three columns and two grouping variables (ID and Group) from which I need to calculate a single max value by ID and Group:
structure(list(ID = structure(c(1L, 1L, 1L, 2L), .Label = c("a1",
"a2"), class = "factor"), Group = structure(c(1L, 1L, 2L, 2L), .Label =
c("abc",
"def"), class = "factor"), Score1 = c(10L, 0L, 0L, 5L), Score2 = c(0L,
0L, 5L, 10L), Score3 = c(0L, 11L, 2L, 11L)), class = "data.frame", row.names =
c(NA,
-4L))
The result I am trying to obtain is:
structure(list(ID = structure(c(1L, 1L, 2L), .Label = c("a1",
"a2"), class = "factor"), Group = structure(c(1L, 2L, 2L), .Label = c("abc",
"def"), class = "factor"), Max = c(11L, 5L, 11L)), class = "data.frame",
row.names = c(NA,
-3L))
I am trying the following in dplyr:
SampTable<-SampDF %>% group_by(ID,Group) %>%
summarize(max = pmax(SampDF$Score1, SampDF$Score2,SampDF$Score3))
But it generates this error:
Error in summarise_impl(.data, dots) :
Column `max` must be length 1 (a summary value), not 4
Is there an easy way to achieve this in dplyr or data.table?
Solution using data.table: find the max value across columns 3:5 (the Score columns) by ID and Group.
library(data.table)
setDT(d)
d[, .(Max = do.call(max, .SD)), .SDcols = 3:5, .(ID, Group)]
ID Group Max
1: a1 abc 11
2: a1 def 5
3: a2 def 11
Data:
d <- structure(list(ID = structure(c(1L, 1L, 1L, 2L), .Label = c("a1",
"a2"), class = "factor"), Group = structure(c(1L, 1L, 2L, 2L), .Label =
c("abc",
"def"), class = "factor"), Score1 = c(10L, 0L, 0L, 5L), Score2 = c(0L,
0L, 5L, 10L), Score3 = c(0L, 11L, 2L, 11L)), class = "data.frame", row.names =
c(NA,
-4L))
A solution using tidyverse.
library(tidyverse)
dat2 <- dat1 %>%
gather(Column, Value, starts_with("Score")) %>%
group_by(ID, Group) %>%
summarise(Max = max(Value)) %>%
ungroup()
dat2
# # A tibble: 3 x 3
# ID Group Max
# <fct> <fct> <dbl>
# 1 a1 abc 11
# 2 a1 def 5
# 3 a2 def 11
Here are a couple of other options with tidyverse
library(tidyverse)
df1 %>%
group_by(ID, Group) %>%
nest %>%
mutate(Max = map_dbl(data, ~ max(unlist(.x)))) %>%
select(-data)
Or using pmax
df1 %>%
mutate(Max = pmax(!!! rlang::syms(names(.)[3:5]))) %>%
group_by(ID, Group) %>%
summarise(Max = max(Max))
# A tibble: 3 x 3
# Groups: ID [?]
# ID Group Max
# <fct> <fct> <dbl>
#1 a1 abc 11
#2 a1 def 5
#3 a2 def 11
Or using base R
aggregate(cbind(Max = do.call(pmax, df1[3:5])) ~ ID + Group, df1, max)
Here is a tidyverse solution using nest :
library(tidyverse)
df %>%
nest(-(1:2),.key="Max") %>%
mutate_at("Max",map_dbl, max)
# ID Group Max
# 1 a1 abc 11
# 2 a1 def 5
# 3 a2 def 11
In base R:
res <- aggregate(. ~ ID + Group,df,max)
res <- cbind(res[1:2], Max = do.call(pmax,res[-(1:2)]))
res
# ID Group Max
# 1 a1 abc 11
# 2 a1 def 5
# 3 a2 def 11
Here is a base R solution
# gives 2x2 table
x <- by(df[, !names(df) %in% c("ID", "Group")], list(df$ID, df$Group), max)
# get requested format
tmp <- expand.grid(ID = rownames(x), Group = colnames(x))
tmp$Max <- as.vector(x)
tmp[complete.cases(tmp), ]
#R ID Group Max
#R 1 a1 abc 11
#R 3 a1 def 5
#R 4 a2 def 11
with
df <- structure(list(
ID = structure(c(1L, 1L, 1L, 2L), .Label = c("a1", "a2"), class = "factor"),
Group = structure(c(1L, 1L, 2L, 2L), .Label = c("abc", "def"), class = "factor"),
Score1 = c(10L, 0L, 0L, 5L), Score2 = c(0L, 0L, 5L, 10L),
Score3 = c(0L, 11L, 2L, 11L)),
class = "data.frame", row.names = c(NA, -4L))
Related
I am writing up my first paper. I have a data frame that has the study, symptoms, and the odds ratio that were analyzed for each symptom in each study. For example:
# Example input: one row per Study/Symptom pair, with integer columns
# OR, lower and upper.
df <- structure(
  list(
    Study = rep(c("Study1", "Study2"), times = 3L),
    Symptom = rep(c("Symptom1", "Symptom2", "Symptom3"), each = 2L),
    OR = rep(c(1L, 0L), times = 3L),
    lower = rep(c(-2L, -1L), times = 3L),
    upper = rep(c(2L, 1L), times = 3L)
  ),
  row.names = c(NA, -6L),
  class = "data.frame"
)
I am wondering how to make a table for publication/what package to use that transforms the data and then prints a table that would look like:
df2 <- structure(list(Symptom = c("Symptom1", "Symptom2", "Symptom3"), Study1 = c("1(-2,2)", "1(-2,2)", "1(-2,2)"), Study2 = c("0(-1,1)", "0(-1,1)", "0(-1,1)")), row.names = c(NA, + -3L), class = "data.frame")
Thanks for the help!
library(dplyr)
library(tidyr)
# Build one "OR(lower,upper)" label per row, then spread the studies into
# columns so that each Symptom becomes a single row.
df %>%
  transmute(Study, Symptom, x = sprintf("%i(%i,%i)", OR, lower, upper)) %>%
  # id_cols is named explicitly: tidyr deprecated supplying it by position.
  pivot_wider(id_cols = Symptom, names_from = Study, values_from = x)
# # A tibble: 3 x 3
# Symptom Study1 Study2
# <chr> <chr> <chr>
# 1 Symptom1 1(-2,2) 0(-1,1)
# 2 Symptom2 1(-2,2) 0(-1,1)
# 3 Symptom3 1(-2,2) 0(-1,1)
My df, Chap3, has ~50 categorical variables. I want to produce a frequency table for each categorical variable that also includes percentages. The code below works fine for the single var bsex but I cannot figure out how to repeat it for all categorical vars. Have tried using variants of apply, using select_if(is.factor), etc, to no avail.
Chap3 %>%
count(bsex) %>%
mutate(percent = round(n / sum(n) * 100,1))
For such cases it is better if you get the categorical data in long format.
library(dplyr)
library(tidyr)
# Stack every factor column into name/value pairs, count each level per
# variable, and add the within-variable percentage next to the count
# (the count column `n` is kept rather than overwritten).
Chap3 %>%
  pivot_longer(cols = where(is.factor)) %>%
  count(name, value) %>%
  group_by(name) %>%
  mutate(percent = round(n / sum(n) * 100, 1)) %>%
  ungroup()
#  name  value     n percent
#  <chr> <fct> <int>   <dbl>
#1 bsex  0         4      40
#2 bsex  1         6      60
#3 csex  0         5      50
#4 csex  1         5      50
data
It is easier to help if you provide data in a reproducible format
set.seed(123)
Chap3 <- data.frame(id = 1:10,
bsex = factor(sample(c(1, 0), 10, replace = TRUE)),
csex = factor(sample(c(1, 0), 10, replace = TRUE)))
We may use table/proportions from base R
proportions(table(stack(type.convert(Chap3[-1], as.is = TRUE))), 2)
ind
values bsex csex
0 0.4 0.5
1 0.6 0.5
data
Chap3 <- structure(list(id = 1:10, bsex = structure(c(2L, 2L, 2L, 1L,
2L, 1L, 1L, 1L, 2L, 2L), .Label = c("0", "1"), class = "factor"),
csex = structure(c(1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L
), .Label = c("0", "1"), class = "factor")), class = "data.frame", row.names = c(NA,
-10L))
I have data where each row represents a household, and I would like to have one row per individual in the different households.
The data looks similar to this:
df <- data.frame(village = rep("aaa",5),household_ID = c(1,2,3,4,5),name_1 = c("Aldo","Giovanni","Giacomo","Pippo","Pippa"),outcome_1 = c("yes","no","yes","no","no"),name_2 = c("John","Mary","Cindy","Eva","Doron"),outcome_2 = c("yes","no","no","no","no"))
I would still like to keep the wide format of the data, just with one individual (and related outcome variables) per row. I could find examples that tell how to do the opposite, going from individual to grouped data using dcast, but I could not find examples of this problem I am facing now.
I have tried with melt
reshape2::melt(df, id.vars = "household_ID")
but that gives me long-format data.
Any suggestions welcome...
Thank you
Use pivot_longer() in tidyr, and set ".value" in names_to to indicate new column names from the pattern of the original column names.
library(tidyr)
df %>%
pivot_longer(-c(village, household_ID),
names_to = c(".value", "n"),
names_sep = "_")
# # A tibble: 10 x 5
# village household_ID n name outcome
# <fct> <dbl> <chr> <fct> <fct>
# 1 aaa 1 1 Aldo yes
# 2 aaa 1 2 John yes
# 3 aaa 2 1 Giovanni no
# 4 aaa 2 2 Mary no
# 5 aaa 3 1 Giacomo yes
# 6 aaa 3 2 Cindy no
# 7 aaa 4 1 Pippo no
# 8 aaa 4 2 Eva no
# 9 aaa 5 1 Pippa no
# 10 aaa 5 2 Doron no
Data
df <- structure(list(village = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "aaa", class = "factor"),
household_ID = c(1, 2, 3, 4, 5), name_1 = structure(c(1L,
3L, 2L, 5L, 4L), .Label = c("Aldo", "Giacomo", "Giovanni",
"Pippa", "Pippo"), class = "factor"), outcome_1 = structure(c(2L,
1L, 2L, 1L, 1L), .Label = c("no", "yes"), class = "factor"),
name_2 = structure(c(4L, 5L, 1L, 3L, 2L), .Label = c("Cindy",
"Doron", "Eva", "John", "Mary"), class = "factor"), outcome_2 = structure(c(2L,
1L, 1L, 1L, 1L), .Label = c("no", "yes"), class = "factor")), class = "data.frame", row.names = c(NA, -5L))
I want to use regex to identify the variable to use to group_by and to summarize my data efficiently. I cannot do separately because I have a large number of variables to summarize and the variable to group_by needs to be passed dynamically each time. data.table accepts using regex to pass the grouping variable, but not the summarizing variables. My attempts so far using tidyverse have been unsuccessful as well. Any help would be much appreciated.
My data:
tempDF <- structure(list(d1 = c("A", "B", "C", "A", "C"), d2 = c(40L, 50L, 20L, 50L, 20L),
d3 = c(20L, 40L, 50L, 40L, 50L), d4 = c(60L, 30L, 30L,60L, 30L), p_A = c(1L,
3L, 2L, 3L, 2L), p_B = c(3L, 4L, 3L, 3L, 4L), p_C = c(2L, 1L, 1L,2L, 1L), p4 = c(5L,
5L, 4L, 5L, 4L)), class = "data.frame", row.names = c(NA, -5L))
View(tempDF)
lLevels<-c("d1")
lContinuum<-c("p_A", "p_B", "p_C")
My attempts:
setDT(tempDF)[ , list(group_means = mean(eval((paste0(lContinuum)))), by=eval((paste0(lLevels))))]
group_means by
1: NA d1
Warning message:
In mean.default(eval((paste0(lContinuum)))) :
argument is not numeric or logical: returning NA
But a single variable works:
setDT(tempDF)[ , list(group_means = mean(p_A)), by=eval((paste0(lLevels)))]
setDT(tempDF)[ , list(group_means = mean(p_B)), by=eval((paste0(lLevels)))]
setDT(tempDF)[ , list(group_means = mean(p_C)), by=eval((paste0(lLevels)))]
Expected output:
tempDF %>%
group_by(d1) %>%
summarise(p_A_mean = mean(p_A), p_B_mean = mean(p_B), p_C_mean = mean(p_C))
# A tibble: 3 x 4
d1 p_A_mean p_B_mean p_C_mean
<chr> <dbl> <dbl> <dbl>
1 A 2 3 2
2 B 3 4 1
3 C 2 3.5 1
The data.table approach is very simple:
library(data.table)
setDT(tempDF)
tempDF[, lapply(.SD, mean),
by = lLevels,
.SDcols = lContinuum]
d1 p_A p_B p_C
1: A 2 3.0 2
2: B 3 4.0 1
3: C 2 3.5 1
Similar approach in dplyr would be:
library(dplyr)
tempDF%>%
group_by_at(lLevels)%>%
summarize_at(lContinuum, mean)
# A tibble: 3 x 4
d1 p_A p_B p_C
<chr> <dbl> <dbl> <dbl>
1 A 2 3 2
2 B 3 4 1
3 C 2 3.5 1
In either case, you can replace lLevels and lContinuum with regex. The dplyr option also would allow for select helpers such as starts_with() and ends_with():
https://www.rdocumentation.org/packages/tidyselect/versions/0.2.5/topics/select_helpers
.
I'm sure this could be made more efficient / succinct, but it meets the spec:
#' Per-group means of every numeric column.
#'
#' @param df A data.frame.
#' @param grouping_var The grouping column, written as `df$col` or as a bare
#'   column name. Only its name is used; the argument is never evaluated.
#' @return A named list with one element per group. Each element is a named
#'   numeric vector holding the mean of every numeric column of `df`.
summarise_df <- function(df, grouping_var) {
  # Recover the column name as a string, dropping any leading "df$".
  grouping_vec <- gsub(".*[$]", "", deparse(substitute(grouping_var)))
  # vapply() always yields a logical mask, whatever the shape of `df`.
  is_num <- vapply(df, is.numeric, logical(1))
  # drop = FALSE keeps a data.frame when only one numeric column is selected;
  # otherwise split() receives a bare vector and the "per-column" means below
  # are computed element by element.
  groups <- split(df[, is_num, drop = FALSE], df[[grouping_vec]])
  # Split-apply: one named vector of column means per group.
  lapply(groups, function(x) vapply(x, mean, numeric(1)))
}
# Stack the per-group mean vectors into a matrix (one row per group).
tmp <- do.call(rbind, summarise_df(df, df$d1))
# Pass the group labels and the numeric matrix as separate arguments so the
# means stay numeric; cbind()-ing them first coerces everything to character.
df <- data.frame(d1 = row.names(tmp), tmp, row.names = NULL)
With Summary vars dynamic too:
#
#' Per-group means of a chosen set of columns.
#'
#' @param df A data.frame.
#' @param grouping_var The grouping column, written as `df$col` or as a bare
#'   column name. Only its name is used; the argument is never evaluated.
#' @param summary_vars Character vector naming the columns to average.
#' @return A named list with one element per group. Each element is a named
#'   numeric vector holding the mean of each column in `summary_vars`.
summarise_df <- function(df, grouping_var, summary_vars) {
  # Recover the column name as a string, dropping any leading "df$".
  grouping_vec <- gsub(".*[$]", "", deparse(substitute(grouping_var)))
  # drop = FALSE keeps a data.frame when a single summary column is requested;
  # otherwise split() receives a bare vector and the "per-column" means below
  # are computed element by element.
  groups <- split(df[, summary_vars, drop = FALSE], df[[grouping_vec]])
  # Split-apply: one named vector of column means per group.
  lapply(groups, function(x) vapply(x, mean, numeric(1)))
}
# Stack the per-group mean vectors into a matrix (one row per group).
tmp <- do.call(rbind, summarise_df(df, df$d1, c("p_A", "p_B", "p_C")))
# Pass the group labels and the numeric matrix as separate arguments so the
# means stay numeric; cbind()-ing them first coerces everything to character.
tmp_df <- data.frame(d1 = row.names(tmp), tmp, row.names = NULL)
Though it looks a bit roundabout, reshaping the data into long form lets you group not only by d1 but also by however many of the p_A ... p_C columns are in the dataset.
edit: also added code to keep certain columns (d_cols) by regex.
library(tidyverse)
tempDF <- structure(
list(d1 = c("A", "B", "C", "A", "C"),
d2 = c(40L, 50L, 20L, 50L, 20L),
d3 = c(20L, 40L, 50L, 40L, 50L),
d4 = c(60L, 30L, 30L,60L, 30L),
d5 = c("AA", "BB", "CC", "AA", "CC"),
p_A = c(1L, 3L, 2L, 3L, 2L),
p_B = c(3L, 4L, 3L, 3L, 4L),
p_C = c(2L, 1L, 1L,2L, 1L),
p4 = c(5L, 5L, 4L, 5L, 4L)),
class = "data.frame",
row.names = c(NA, -5L))
# columns of d to keep, in strings
d_cols <- str_subset(colnames(tempDF), "d[15]")
# Stack the p_* columns into name/value pairs, average each one within the
# kept d columns, then spread the means back out to one column per p_*.
tempDF %>%
  pivot_longer(cols = matches("p_")) %>%
  group_by(!!!syms(d_cols), name) %>%
  summarize(mean = mean(value)) %>%
  # d_cols is an external character vector, so wrap it in all_of(); passing
  # it bare is ambiguous (it could be read as a column literally named d_cols).
  pivot_wider(id_cols = all_of(d_cols),
              values_from = mean,
              names_prefix = "mean_")
#> # A tibble: 3 x 5
#> # Groups: d1, d5 [3]
#> d1 d5 mean_p_A mean_p_B mean_p_C
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 A AA 2 3 2
#> 2 B BB 3 4 1
#> 3 C CC 2 3.5 1
Created on 2019-10-19 by the reprex package (v0.3.0)
Using the following dataframe I would like to group the data by replicate and group and then calculate a ratio of treatment values to control values.
structure(list(group = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("case", "controls"), class = "factor"), treatment = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "EPA", class = "factor"),
replicate = structure(c(2L, 4L, 3L, 1L, 2L, 4L, 3L, 1L), .Label = c("four",
"one", "three", "two"), class = "factor"), fatty_acid_family = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "saturated", class = "factor"),
fatty_acid = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "14:0", class = "factor"),
quant = c(6.16, 6.415, 4.02, 4.05, 4.62, 4.435, 3.755, 3.755
)), .Names = c("group", "treatment", "replicate", "fatty_acid_family",
"fatty_acid", "quant"), class = "data.frame", row.names = c(NA,
-8L))
I have tried using dplyr as follows:
group_by(dataIn, replicate, group) %>% transmute(ratio = quant[group=="case"]/quant[group=="controls"])
but this results in Error: incompatible size (%d), expecting %d (the group size) or 1
Initially I thought this might be because I was trying to create 4 ratios from a df 8 rows deep and so I thought summarise might be the answer (collapsing each group to one ratio) but that doesn't work either (my understanding is a shortcoming).
group_by(dataIn, replicate, group) %>% summarise(ratio = quant[group=="case"]/quant[group=="controls"])
replicate group ratio
1 four case NA
2 four controls NA
3 one case NA
4 one controls NA
5 three case NA
6 three controls NA
7 two case NA
8 two controls NA
I would appreciate some advice on where I'm going wrong or even if this can be done with dplyr.
Thanks.
You can try:
group_by(dataIn, replicate) %>%
summarise(ratio = quant[group=="case"]/quant[group=="controls"])
#Source: local data frame [4 x 2]
#
# replicate ratio
#1 four 1.078562
#2 one 1.333333
#3 three 1.070573
#4 two 1.446449
Because you grouped by replicate and group, you could not access data from different groups at the same time.
@talat's answer solved it for me. I created a minimal reproducible example to help my own understanding:
df <- structure(list(a = c("a", "a", "b", "b", "c", "c", "d", "d"),
b = c(1, 2, 1, 2, 1, 2, 1, 2), c = c(22, 15, 5, 0.2, 107,
6, 0.2, 4)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
# a b c
# 1 a 1 22.0
# 2 a 2 15.0
# 3 b 1 5.0
# 4 b 2 0.2
# 5 c 1 107.0
# 6 c 2 6.0
# 7 d 1 0.2
# 8 d 2 4.0
library(dplyr)
df %>%
group_by(a) %>%
summarise(prop = c[b == 1] / c[b == 2])
# a prop
# 1 a 1.466667
# 2 b 25.000000
# 3 c 17.833333
# 4 d 0.050000