I am writing up my first paper. I have a data frame that has the study, the symptoms, and the odds ratios that were analyzed for each symptom in each study. For example:
df <- structure(list(Study = c("Study1", "Study2", "Study1", "Study2", "Study1", "Study2"), Symptom = c("Symptom1", "Symptom1", "Symptom2", "Symptom2", "Symptom3", "Symptom3"), OR= c(1L, 0L, 1L, 0L, 1L, 0L), lower = c(-2L, -1L, -2L, -1L, -2L, -1L), upper = c(2L, 1L, 2L, 1L, 2L, 1L)), row.names = c(NA, + -6L), class = "data.frame")
I am wondering how to make a table for publication/what package to use that transforms the data and then prints a table that would look like:
df2 <- structure(list(Symptom = c("Symptom1", "Symptom2", "Symptom3"), Study1 = c("1(-2,2)", "1(-2,2)", "1(-2,2)"), Study2 = c("0(-1,1)", "0(-1,1)", "0(-1,1)")), row.names = c(NA, + -3L), class = "data.frame")
Thanks for the help!
library(dplyr)
library(tidyr)
# Build one row per symptom and one column per study, with each cell
# formatted as "OR(lower,upper)".
df %>%
  # %i requires integer columns; OR, lower and upper are integers in `df`.
  transmute(Study, Symptom, x = sprintf("%i(%i,%i)", OR, lower, upper)) %>%
  # id_cols is passed by name: in current tidyr the arguments after `data`
  # are matched by name, so a positional `Symptom` is no longer read as id_cols.
  pivot_wider(id_cols = Symptom, names_from = Study, values_from = x)
# # A tibble: 3 x 3
# Symptom Study1 Study2
# <chr> <chr> <chr>
# 1 Symptom1 1(-2,2) 0(-1,1)
# 2 Symptom2 1(-2,2) 0(-1,1)
# 3 Symptom3 1(-2,2) 0(-1,1)
Related
My df, Chap3, has ~50 categorical variables. I want to produce a frequency table for each categorical variable that also includes percentages. The code below works fine for the single var bsex but I cannot figure out how to repeat it for all categorical vars. Have tried using variants of apply, using select_if(is.factor), etc, to no avail.
# Frequency table for the single variable `bsex`: count() yields one row per
# level with its count `n`; `percent` is that count's share of all counted
# rows, expressed in percent and rounded to one decimal.
Chap3 %>%
count(bsex) %>%
mutate(percent = round(n / sum(n) * 100,1))
For such cases it is better if you get the categorical data in long format.
library(dplyr)
library(tidyr)
# Frequency table for every factor column at once: stack the factor columns
# into long format (`name` = variable, `value` = level), count each
# variable/level pair, and add the within-variable percentage.
Chap3 %>%
  pivot_longer(cols = where(is.factor)) %>%
  count(name, value) %>%
  group_by(name) %>%
  # Keep the count `n` and add the percentage alongside it, mirroring the
  # single-variable version (share of the variable's total, 1 decimal).
  mutate(percent = round(n / sum(n) * 100, 1)) %>%
  ungroup()
# name value n percent
# <chr> <fct> <int> <dbl>
#1 bsex 0 4 40
#2 bsex 1 6 60
#3 csex 0 5 50
#4 csex 1 5 50
data
It is easier to help if you provide data in a reproducible format
set.seed(123)
Chap3 <- data.frame(id = 1:10,
bsex = factor(sample(c(1, 0), 10, replace = TRUE)),
csex = factor(sample(c(1, 0), 10, replace = TRUE)))
We may use table/proportions from base R
# Drop the first column (id), convert the remaining factors to plain values,
# stack them into (values, ind) pairs and cross-tabulate; margin 2 makes each
# variable's column sum to 1.
chap3_vars <- type.convert(Chap3[-1], as.is = TRUE)
level_counts <- table(stack(chap3_vars))
proportions(level_counts, margin = 2)
ind
values bsex csex
0 0.4 0.5
1 0.6 0.5
data
Chap3 <- structure(list(id = 1:10, bsex = structure(c(2L, 2L, 2L, 1L,
2L, 1L, 1L, 1L, 2L, 2L), .Label = c("0", "1"), class = "factor"),
csex = structure(c(1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L
), .Label = c("0", "1"), class = "factor")), class = "data.frame", row.names = c(NA,
-10L))
In the below data set I want to first check which rows have same value for both column U and T. For all such rows I want to calculate mean of Mean column, min of Min column and max of Max column.
I can do this easily if the rows with the same values of columns U and T are in a separate data.frame(), but in this case I first need to extract all such sub-data.frames from within the data.frame() and then perform the operation.
If anyone has a better approach using R libraries, please suggest it.
Input Data
data <- structure(list(A = c(0.1, 0.1, 0.1, 0.1), B = c(NA, NA, NA, NA
), C = structure(c(1L, 1L, 1L, 1L), .Label = "Yes", class = "factor"),
U = c(11L, 11L, 11L, 11L), T = structure(c(1L, 1L, 1L, 1L
), .Label = "A", class = "factor"), P = structure(c(1L, 1L,
1L, 1L), .Label = "INT", class = "factor"), Q = 1:4, R = c(0L,
0L, 0L, 0L), S = c(1L, 1L, 1L, 1L), W = structure(c(1L, 1L,
1L, 1L), .Label = "A", class = "factor"), Mean = c(21.208,
21.22333333, 21.23666667, 21.174), Min = c(21.02, 21.01,
21.09, 21.02), Max = c(21.35, 21.39, 21.47, 21.36)), class = "data.frame", row.names = c(NA,
-4L))
Expected Output
A B C U T P Q R S W Mean Min Max
0.1 NA Yes 11 A INT 4 0 1 A 21.2105 21.01 21.47
We can use
library(tidyverse)
# Collapse each (U, T) group to a single row: every row of a group first
# receives the group's mean of Mean, min of Min and max of Max, then only the
# first row of the group is kept (so the other columns come from that row).
data %>%
  group_by(U, T) %>%
  mutate(Mean = mean(Mean), Min = min(Min), Max = max(Max)) %>%
  slice(1) %>%
  # Drop the grouping so later verbs act on the whole result, not per group.
  ungroup()
# Base R version: split the rows by their (U, T) combination, reduce each piece
# to one row, and bind the pieces back together.
stat_cols <- c("Mean", "Min", "Max")
# Columns that are carried over unchanged from the first row of each group.
nm <- names(data)[!(names(data) %in% stat_cols)]
groups <- split(data, paste(data$U, data$T))
collapsed <- lapply(groups, function(grp) {
  data.frame(
    grp[1, nm],
    Mean = mean(grp$Mean),
    Min = min(grp$Min),
    Max = max(grp$Max)
  )
})
do.call(rbind, collapsed)
# A B C U T P Q R S W Mean Min Max
#11 A 0.1 NA Yes 11 A INT 1 0 1 A 21.2105 21.01 21.47
Assuming these are a few timestamped observations in a dataset:
Id Status DateCreated Group
10 Read 2017-11-04 18:24:55 Red
10 Write 2017-11-04 18:24:56 Red
10 Review 2017-11-04 18:25:16 Red
10 Read 2017-11-04 18:26:17 Red
10 Write 2017-11-04 18:26:47 Red
How do I collapse rows that are within 1 minute of each other?
For example, rows 1,2,3 are collapsed into 1 row and rows 4 and 5 are collapsed into second row.
The expected output would look like this:
Id Status DateCreated Date Ended Group
10 Read,Write,Review 2017-11-04 18:24:55 2017-11-04 18:25:16 Red, Red, Red
10 Read,Write 2017-11-04 18:26:17 2017-11-04 18:26:47 Red, Red
Here is the code to reproduce the test dataset in this example:
df <- structure(list(Id = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "10", class = "factor"),
Status = structure(c(1L, 3L, 2L, 1L, 3L), .Label = c("Read",
"Review", "Write"), class = "factor"), DateCreated = structure(1:5, .Label = c("2017-11-04 18:24:55",
"2017-11-04 18:24:56", "2017-11-04 18:25:16", "2017-11-04 18:26:17",
"2017-11-04 18:26:47"), class = "factor"), Group = structure(c(1L,
1L, 1L, 1L, 1L), .Label = "Red", class = "factor")), class = "data.frame", row.names = c(NA,
-5L))
I would do something like that:
# Collapse consecutive rows of the same Id into one row whenever a row follows
# the previous one by at most 60 seconds. Grouping by the clock minute would
# split 18:24:56 and 18:25:16 even though they are only 20 seconds apart, so
# runs are built from the gap to the previous row instead.
df %>%
  mutate(DateCreated = ymd_hms(DateCreated)) %>%
  arrange(Id, DateCreated) %>%
  group_by(Id) %>%
  mutate(
    # Fixed units: a bare POSIXct subtraction picks its units automatically.
    gap_secs = as.numeric(difftime(DateCreated, lag(DateCreated), units = "secs")),
    # A new run starts at the first row of an Id (gap is NA) or after a gap > 60 s.
    run = cumsum(is.na(gap_secs) | gap_secs > 60)
  ) %>%
  group_by(Id, run) %>%
  summarise(
    Status = paste(Status, collapse = ", "),
    # Date_ended must be computed BEFORE DateCreated is overwritten: summarise()
    # evaluates its arguments in order, and later ones see earlier results.
    Date_ended = last(DateCreated),
    DateCreated = first(DateCreated),
    Group = paste(Group, collapse = ", ")
  ) %>%
  ungroup() %>%
  select(Id, Status, DateCreated, Date_ended, Group)
library(lubridate)
library(dplyr)
library(purrr)
df <-
structure(
list(
Id = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "10", class = "factor"),
Status = structure(
c(1L, 3L, 2L, 1L, 3L),
.Label = c("Read",
"Review", "Write"),
class = "factor"
),
DateCreated = structure(
1:5,
.Label = c(
"2017-11-04 18:24:55",
"2017-11-04 18:24:56",
"2017-11-04 18:25:16",
"2017-11-04 18:26:17",
"2017-11-04 18:26:47"
),
class = "factor"
),
Group = structure(c(1L,
1L, 1L, 1L, 1L), .Label = "Red", class = "factor")
),
class = "data.frame",
row.names = c(NA,-5L)
)
# Step 1: parse the timestamps, sort them, and store the gap (in seconds) to
# the previous row in `diff`. Units are fixed to seconds because a bare
# POSIXct subtraction chooses its units automatically.
df2 <-
  df %>%
  mutate(DateCreated = as_datetime(DateCreated)) %>%
  arrange(DateCreated) %>%
  mutate(diff = as.numeric(difftime(DateCreated, lag(DateCreated), units = "secs")))
# The first row has no predecessor; treat its gap as 0 so it opens group 0.
df2$diff[1] <- 0

# Step 2: a gap of more than 60 seconds starts a new group. The running count
# of such gaps is the group id, so no global counter is needed and the gap is
# never compared against the group id itself.
df3 <- df2 %>%
  mutate(date_groups = cumsum(diff > 60)) %>%
  group_by(date_groups) %>%
  summarise(
    Status = paste(Status, collapse = ", "),
    # Computed before DateCreated is overwritten below; otherwise last() would
    # see the already-collapsed start time and the end would equal the start.
    Date_ended = last(DateCreated),
    DateCreated = first(DateCreated),
    Group = paste(Group, collapse = ", ")
  ) %>%
  select(date_groups, Status, DateCreated, Date_ended, Group)
df3
#> # A tibble: 2 x 5
#> date_groups Status DateCreated Date_ended Group
#> <int> <chr> <dttm> <dttm> <chr>
#> 1 0 Read, Write… 2017-11-04 18:24:55 2017-11-04 18:25:16 Red, Re…
#> 2 1 Read, Write 2017-11-04 18:26:17 2017-11-04 18:26:47 Red, Red
Created on 2019-01-28 by the reprex package (v0.2.1)
I have a data file with numeric values in three columns and two grouping variables (ID and Group) from which I need to calculate a single max value by ID and Group:
structure(list(ID = structure(c(1L, 1L, 1L, 2L), .Label = c("a1",
"a2"), class = "factor"), Group = structure(c(1L, 1L, 2L, 2L), .Label =
c("abc",
"def"), class = "factor"), Score1 = c(10L, 0L, 0L, 5L), Score2 = c(0L,
0L, 5L, 10L), Score3 = c(0L, 11L, 2L, 11L)), class = "data.frame", row.names =
c(NA,
-4L))
The result I am trying to obtain is:
structure(list(ID = structure(c(1L, 1L, 2L), .Label = c("a1",
"a2"), class = "factor"), Group = structure(c(1L, 2L, 2L), .Label = c("abc",
"def"), class = "factor"), Max = c(11L, 5L, 11L)), class = "data.frame",
row.names = c(NA,
-3L))
I am trying the following in dplyr:
# Attempt from the question; this is the call that raises the error quoted
# below. `SampDF$Score1` etc. refer to the full, ungrouped columns, and pmax()
# returns one value per row (4 here), while summarize() requires a single
# value per ID/Group combination.
SampTable<-SampDF %>% group_by(ID,Group) %>%
summarize(max = pmax(SampDF$Score1, SampDF$Score2,SampDF$Score3))
But it generates this error:
Error in summarise_impl(.data, dots) :
Column `max` must be length 1 (a summary value), not 4
Is there an easy way to achieve this in dplyr or data.table?
Solution using data.table. Find max value on 3:5 columns (Score columns) by ID and Group.
library(data.table)
# Convert `d` to a data.table in place, then take the single largest value
# found in columns 3 to 5 (the Score columns) within each ID/Group pair.
setDT(d)
d[, .(Max = do.call(max, .SD)), by = .(ID, Group), .SDcols = 3:5]
ID Group Max
1: a1 abc 11
2: a1 def 5
3: a2 def 11
Data:
d <- structure(list(ID = structure(c(1L, 1L, 1L, 2L), .Label = c("a1",
"a2"), class = "factor"), Group = structure(c(1L, 1L, 2L, 2L), .Label =
c("abc",
"def"), class = "factor"), Score1 = c(10L, 0L, 0L, 5L), Score2 = c(0L,
0L, 5L, 10L), Score3 = c(0L, 11L, 2L, 11L)), class = "data.frame", row.names =
c(NA,
-4L))
A solution using tidyverse.
library(tidyverse)
# Largest score per ID/Group: stack all Score* columns into a single `Value`
# column (their names go to `Column`), then take the maximum within each group.
# pivot_longer() replaces the superseded gather().
dat2 <- dat1 %>%
  pivot_longer(
    cols = starts_with("Score"),
    names_to = "Column",
    values_to = "Value"
  ) %>%
  group_by(ID, Group) %>%
  summarise(Max = max(Value)) %>%
  ungroup()
dat2
# # A tibble: 3 x 3
# ID Group Max
# <fct> <fct> <dbl>
# 1 a1 abc 11
# 2 a1 def 5
# 3 a2 def 11
Here are couple of other options with tidyverse
library(tidyverse)
# Nest the non-grouping columns of each ID/Group pair into a list-column
# `data`, reduce every nested table to its single largest value, and drop the
# list-column again.
df1 %>%
  group_by(ID, Group) %>%
  nest() %>%
  mutate(Max = map_dbl(data, function(scores) max(unlist(scores)))) %>%
  select(-data)
Or using pmax
# Row-wise maximum across columns 3 to 5 first (their names are turned into
# symbols and spliced into pmax()), then the largest of those per ID/Group.
score_syms <- rlang::syms(names(df1)[3:5])
df1 %>%
  mutate(Max = pmax(!!!score_syms)) %>%
  group_by(ID, Group) %>%
  summarise(Max = max(Max))
# A tibble: 3 x 3
# Groups: ID [?]
# ID Group Max
# <fct> <fct> <dbl>
#1 a1 abc 11
#2 a1 def 5
#3 a2 def 11
Or using base R
# Row-wise maximum over columns 3 to 5, then the largest of those values
# within each ID/Group combination.
row_max <- do.call(pmax, df1[3:5])
aggregate(cbind(Max = row_max) ~ ID + Group, data = df1, FUN = max)
Here is a tidyverse solution using nest :
library(tidyverse)
# Nest every column except the first two (ID, Group) into a list-column named
# `Max`, then replace each nested table by its largest value.
# Uses the `new_col = <selection>` form of nest() and a plain mutate() in place
# of the deprecated `.key` argument and the superseded mutate_at().
df %>%
  nest(Max = -(1:2)) %>%
  mutate(Max = map_dbl(Max, max))
# ID Group Max
# 1 a1 abc 11
# 2 a1 def 5
# 3 a2 def 11
In base R:
# Per ID/Group maximum of every score column, then the row-wise maximum across
# those per-column maxima, kept next to the two grouping columns.
group_max <- aggregate(. ~ ID + Group, data = df, FUN = max)
res <- cbind(group_max[1:2], Max = do.call(pmax, group_max[-(1:2)]))
res
# ID Group Max
# 1 a1 abc 11
# 2 a1 def 5
# 3 a2 def 11
Here is a base R solution
# by() returns an ID x Group table holding the overall maximum of the score
# columns for each combination (NA where a combination does not occur).
is_score <- !(names(df) %in% c("ID", "Group"))
x <- by(df[, is_score], INDICES = list(df$ID, df$Group), FUN = max)
# Reshape to long format: expand.grid() varies ID fastest, which matches the
# column-major order of as.vector(x); combinations without data are dropped.
tmp <- expand.grid(ID = rownames(x), Group = colnames(x))
tmp$Max <- as.vector(x)
tmp[complete.cases(tmp), ]
#R ID Group Max
#R 1 a1 abc 11
#R 3 a1 def 5
#R 4 a2 def 11
with
# Example data: two factor keys (ID, Group) and three integer score columns.
df <- data.frame(
  ID = factor(c("a1", "a1", "a1", "a2"), levels = c("a1", "a2")),
  Group = factor(c("abc", "abc", "def", "def"), levels = c("abc", "def")),
  Score1 = c(10L, 0L, 0L, 5L),
  Score2 = c(0L, 0L, 5L, 10L),
  Score3 = c(0L, 11L, 2L, 11L)
)
Using the following dataset:
temp <- structure(list(
GENDER = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L),
.Label = c("F", "M"),
class = "factor"),
EVERFSM_6 = c(0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L),
`0001` = c(0, 11, 22, 33, 33, 55, 66, 77, 88, 0),
n = c(20L, 13L, 4L, 13L, 36L, 94L, 28L, 50L, 27L, 1L)),
.Names = c("GENDER", "EVERFSM_6", "0001", "n"),
class = c("tbl_df", "data.frame"),
row.names = c(NA, -10L))
And I'm trying to perform the following spread_ operation to summarise the data:
# Attempt from the question: the column to spread is chosen at run time.
# DiscID holds the column name as a string; colID is the same name as a symbol.
DiscID <- "0001"
colID <- as.name(DiscID)
# Mixing strings with a symbol makes c() return a list rather than a vector.
cols <- c("GENDER", colID, "n")
gender_results <- temp %>%
select_(.dots=cols) %>%
group_by_(.dots=cols[1:2]) %>%
summarise(gender_n = sum(n)) %>%
# The key is passed wrapped in backticks; this is the step reported to fail
# with "Key column '`0001`' does not exist in input."
spread_(paste0("`",DiscID,"`"), "gender_n") %>%
rename(type = GENDER)
But it says:
Error: Key column '`0001`' does not exist in input.
I'm having to use the _ version of select_, group_by_ and spread_ as I am using a variable to refer to column names. The desired output is below, achievable by using the hard coded:
spread(`0001`, gender_n) %>%
type 0 11 22 33 55 66 77 88
(fctr) (int) (int) (int) (int) (int) (int) (int) (int)
1 F 20 13 4 36 94 28 NA NA
2 M 1 NA NA 13 NA NA 50 27
I think your DiscID is already OK for use with spread_; there is no need to paste. Does this help?
# Same pipeline as in the question, except that spread_() receives the key
# column as the plain string held in DiscID (no backticks pasted around it).
# DiscID and colID are expected to be defined beforehand.
cols <- c("GENDER", colID, "n")
gender_results <- temp %>%
select_(.dots=cols) %>%
group_by_(.dots=cols[1:2]) %>%
summarise(gender_n = sum(n)) %>%
spread_(DiscID, "gender_n") %>%
rename(type = GENDER)
alternatively:
# Variant that builds the column specification as an explicit list: one-sided
# formulas for the fixed columns plus the symbol colID for the run-time one.
# DiscID and colID are expected to be defined beforehand.
cols <- list(~GENDER, colID, ~n)
gender_results <- temp %>%
select_(.dots=cols) %>%
group_by_(.dots=cols[1:2]) %>%
summarise(gender_n = sum(n)) %>%
spread_(DiscID, "gender_n") %>%
rename(type = GENDER)
From the NSE vignette in dplyr