Using the following dataset:
temp <- structure(list(
GENDER = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L),
.Label = c("F", "M"),
class = "factor"),
EVERFSM_6 = c(0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L),
`0001` = c(0, 11, 22, 33, 33, 55, 66, 77, 88, 0),
n = c(20L, 13L, 4L, 13L, 36L, 94L, 28L, 50L, 27L, 1L)),
.Names = c("GENDER", "EVERFSM_6", "0001", "n"),
class = c("tbl_df", "data.frame"),
row.names = c(NA, -10L))
And I'm trying to perform the following spread_ operation to summarise the data:
# Attempt from the question: the column to spread on is held as a string.
DiscID <- "0001"
# Symbol version of the same name, used for the select_/group_by_ steps.
colID <- as.name(DiscID)
# Mixing strings and a symbol in c() yields a list, which is what .dots takes.
cols <- c("GENDER", colID, "n")
gender_results <- temp %>%
select_(.dots=cols) %>%
group_by_(.dots=cols[1:2]) %>%
summarise(gender_n = sum(n)) %>%
# The key is passed wrapped in literal backticks; this is the step that
# raises the "Key column ... does not exist in input" error quoted below.
spread_(paste0("`",DiscID,"`"), "gender_n") %>%
rename(type = GENDER)
But it says:
Error: Key column '`0001`' does not exist in input.
I'm having to use the _ version of select_, group_by_ and spread_ as I am using a variable to refer to column names. The desired output is below, achievable by using the hard coded:
spread(`0001`, gender_n) %>%
type 0 11 22 33 55 66 77 88
(fctr) (int) (int) (int) (int) (int) (int) (int) (int)
1 F 20 13 4 36 94 28 NA NA
2 M 1 NA NA 13 NA NA 50 27
I think your DiscID is already OK for use with spread_(), so there is no need to paste backticks around it. Does this help?
# Same steps as in the question, except that spread_() receives the plain
# column name stored in DiscID (no backticks pasted around it).
cols <- c("GENDER", colID, "n")
group_cols <- cols[1:2]
picked <- select_(temp, .dots = cols)
by_gender <- group_by_(picked, .dots = group_cols)
totals <- summarise(by_gender, gender_n = sum(n))
wide <- spread_(totals, DiscID, "gender_n")
gender_results <- rename(wide, type = GENDER)
alternatively:
# Variant that describes the columns as a list of one-sided formulas plus the
# symbol colID, instead of a character/symbol mix.
cols <- list(~GENDER, colID, ~n)
kept_columns <- select_(temp, .dots = cols)
per_group <- group_by_(kept_columns, .dots = cols[1:2])
group_sums <- summarise(per_group, gender_n = sum(n))
spread_out <- spread_(group_sums, DiscID, "gender_n")
gender_results <- rename(spread_out, type = GENDER)
From the NSE vignette in dplyr
Related
I am writing up my first paper. I have a data frame that has the study, symptoms, and the odds ratio that were analyzed for each symptom in each study. For example:
df <- structure(list(Study = c("Study1", "Study2", "Study1", "Study2", "Study1", "Study2"), Symptom = c("Symptom1", "Symptom1", "Symptom2", "Symptom2", "Symptom3", "Symptom3"), OR= c(1L, 0L, 1L, 0L, 1L, 0L), lower = c(-2L, -1L, -2L, -1L, -2L, -1L), upper = c(2L, 1L, 2L, 1L, 2L, 1L)), row.names = c(NA, + -6L), class = "data.frame")
I am wondering how to make a table for publication/what package to use that transforms the data and then prints a table that would look like:
df2 <- structure(list(Symptom = c("Symptom1", "Symptom2", "Symptom3"), Study1 = c("1(-2,2)", "1(-2,2)", "1(-2,2)"), Study2 = c("0(-1,1)", "0(-1,1)", "0(-1,1)")), row.names = c(NA, + -3L), class = "data.frame")
Thanks for the help!
library(dplyr)
library(tidyr)
# Collapse each odds ratio and its bounds into one "OR(lower,upper)" label,
# then spread the studies into columns, giving one row per symptom.
df %>%
  transmute(Study, Symptom, x = sprintf("%i(%i,%i)", OR, lower, upper)) %>%
  # id_cols is given by name: passing it positionally is deprecated in
  # recent tidyr releases.
  pivot_wider(id_cols = Symptom, names_from = Study, values_from = x)
# # A tibble: 3 x 3
# Symptom Study1 Study2
# <chr> <chr> <chr>
# 1 Symptom1 1(-2,2) 0(-1,1)
# 2 Symptom2 1(-2,2) 0(-1,1)
# 3 Symptom3 1(-2,2) 0(-1,1)
My df, Chap3, has ~50 categorical variables. I want to produce a frequency table for each categorical variable that also includes percentages. The code below works fine for the single var bsex but I cannot figure out how to repeat it for all categorical vars. Have tried using variants of apply, using select_if(is.factor), etc, to no avail.
# Frequency table for bsex: one row per level with its count and its share of
# all rows, expressed as a percentage rounded to one decimal place.
bsex_counts <- count(Chap3, bsex)
mutate(bsex_counts, percent = round(n / sum(n) * 100, 1))
For such cases it is better if you get the categorical data in long format.
library(dplyr)
library(tidyr)
# Stack every factor column into name/value pairs, count each level per
# variable, and add the percentage within that variable. The count stays in
# `n`; the share goes into a separate `percent` column (same formula as the
# single-variable version in the question).
Chap3 %>%
  pivot_longer(cols = where(is.factor)) %>%
  count(name, value) %>%
  group_by(name) %>%
  mutate(percent = round(n / sum(n) * 100, 1)) %>%
  ungroup()
#  name  value     n percent
#  <chr> <fct> <int>   <dbl>
#1 bsex  0         4      40
#2 bsex  1         6      60
#3 csex  0         5      50
#4 csex  1         5      50
data
It is easier to help if you provide data in a reproducible format
# Reproducible toy data: ten rows with an id and two binary factors.
# The two random draws are made in the same order as the columns appear.
set.seed(123)
n_rows <- 10
bsex_draw <- sample(c(1, 0), n_rows, replace = TRUE)
csex_draw <- sample(c(1, 0), n_rows, replace = TRUE)
Chap3 <- data.frame(
  id = seq_len(n_rows),
  bsex = factor(bsex_draw),
  csex = factor(csex_draw)
)
We may use table/proportions from base R
# Drop the first column, convert the remaining columns with type.convert(),
# stack them into values/ind pairs and report proportions within each column
# of the resulting table (margin = 2).
converted <- type.convert(Chap3[-1], as.is = TRUE)
stacked <- stack(converted)
proportions(table(stacked), margin = 2)
ind
values bsex csex
0 0.4 0.5
1 0.6 0.5
data
Chap3 <- structure(list(id = 1:10, bsex = structure(c(2L, 2L, 2L, 1L,
2L, 1L, 1L, 1L, 2L, 2L), .Label = c("0", "1"), class = "factor"),
csex = structure(c(1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L
), .Label = c("0", "1"), class = "factor")), class = "data.frame", row.names = c(NA,
-10L))
I have a data file with numeric values in three columns and two grouping variables (ID and Group) from which I need to calculate a single max value by ID and Group:
structure(list(ID = structure(c(1L, 1L, 1L, 2L), .Label = c("a1",
"a2"), class = "factor"), Group = structure(c(1L, 1L, 2L, 2L), .Label =
c("abc",
"def"), class = "factor"), Score1 = c(10L, 0L, 0L, 5L), Score2 = c(0L,
0L, 5L, 10L), Score3 = c(0L, 11L, 2L, 11L)), class = "data.frame", row.names =
c(NA,
-4L))
The result I am trying to obtain is:
structure(list(ID = structure(c(1L, 1L, 2L), .Label = c("a1",
"a2"), class = "factor"), Group = structure(c(1L, 2L, 2L), .Label = c("abc",
"def"), class = "factor"), Max = c(11L, 5L, 11L)), class = "data.frame",
row.names = c(NA,
-3L))
I am trying the following in dplyr:
# Attempt from the question. Inside summarize() the SampDF$Score* references
# point at the whole data frame rather than the current group, so pmax()
# returns one value per row of SampDF (4 here) instead of one per ID/Group,
# which is what the error quoted below complains about.
SampTable<-SampDF %>% group_by(ID,Group) %>%
summarize(max = pmax(SampDF$Score1, SampDF$Score2,SampDF$Score3))
But it generates this error:
Error in summarise_impl(.data, dots) :
Column `max` must be length 1 (a summary value), not 4
Is there an easy way to achieve this in dplyr or data.table?
Solution using data.table. Find max value on 3:5 columns (Score columns) by ID and Group.
library(data.table)
# Convert d to a data.table by reference, then take the largest value found
# in columns 3 to 5 within each ID/Group combination.
setDT(d)
d[, .(Max = do.call(max, .SD)), by = .(ID, Group), .SDcols = 3:5]
ID Group Max
1: a1 abc 11
2: a1 def 5
3: a2 def 11
Data:
d <- structure(list(ID = structure(c(1L, 1L, 1L, 2L), .Label = c("a1",
"a2"), class = "factor"), Group = structure(c(1L, 1L, 2L, 2L), .Label =
c("abc",
"def"), class = "factor"), Score1 = c(10L, 0L, 0L, 5L), Score2 = c(0L,
0L, 5L, 10L), Score3 = c(0L, 11L, 2L, 11L)), class = "data.frame", row.names =
c(NA,
-4L))
A solution using tidyverse.
library(tidyverse)
# Stack all Score* columns into a single Value column, then keep the largest
# value per ID/Group. pivot_longer() replaces the superseded gather().
dat2 <- dat1 %>%
  pivot_longer(starts_with("Score"), names_to = "Column", values_to = "Value") %>%
  group_by(ID, Group) %>%
  summarise(Max = max(Value)) %>%
  ungroup()
dat2
# # A tibble: 3 x 3
# ID Group Max
# <fct> <fct> <dbl>
# 1 a1 abc 11
# 2 a1 def 5
# 3 a2 def 11
Here are a couple of other options with tidyverse
library(tidyverse)
# Pack the non-grouping columns of each ID/Group into a list-column called
# `data`, reduce every nested data frame to its single largest value, and
# drop the list-column again.
nested <- nest(group_by(df1, ID, Group))
with_max <- mutate(
  nested,
  Max = map_dbl(data, function(scores) max(unlist(scores)))
)
select(with_max, -data)
Or using pmax
# Row-wise maximum over columns 3 to 5 first (their names are spliced into
# pmax() as symbols), then the maximum of those row maxima per ID/Group.
score_syms <- rlang::syms(names(df1)[3:5])
row_max <- mutate(df1, Max = pmax(!!!score_syms))
summarise(group_by(row_max, ID, Group), Max = max(Max))
# A tibble: 3 x 3
# Groups: ID [?]
# ID Group Max
# <fct> <fct> <dbl>
#1 a1 abc 11
#2 a1 def 5
#3 a2 def 11
Or using base R
# Row-wise maximum of columns 3 to 5, labelled Max, then aggregated with max()
# per ID/Group through the formula interface.
aggregate(
  cbind(Max = do.call(pmax, df1[3:5])) ~ ID + Group,
  data = df1,
  FUN = max
)
Here is a tidyverse solution using nest :
library(tidyverse)
# Nest everything except the first two columns into a list-column named
# "Max", then replace each nested data frame by its largest value.
nested <- nest(df, -(1:2), .key = "Max")
mutate_at(nested, "Max", map_dbl, max)
# ID Group Max
# 1 a1 abc 11
# 2 a1 def 5
# 3 a2 def 11
In base R:
# Per ID/Group, take the maximum of every remaining column, then reduce those
# column maxima to one value per row with pmax().
group_max <- aggregate(. ~ ID + Group, data = df, FUN = max)
key_cols <- group_max[1:2]
score_cols <- group_max[-(1:2)]
res <- cbind(key_cols, Max = do.call(pmax, score_cols))
res
# ID Group Max
# 1 a1 abc 11
# 2 a1 def 5
# 3 a2 def 11
Here is a base R solution
# gives 2x2 table
# Largest score for every ID x Group cell (a 2x2 table for the example data)
score_cols <- setdiff(names(df), c("ID", "Group"))
cell_max <- by(df[, score_cols], list(df$ID, df$Group), max)
# Reshape to the requested layout: one row per ID/Group pair, keeping only
# rows without missing values
long <- expand.grid(ID = rownames(cell_max), Group = colnames(cell_max))
long[["Max"]] <- as.vector(cell_max)
long[complete.cases(long), ]
#R ID Group Max
#R 1 a1 abc 11
#R 3 a1 def 5
#R 4 a2 def 11
with
df <- structure(list(
ID = structure(c(1L, 1L, 1L, 2L), .Label = c("a1", "a2"), class = "factor"),
Group = structure(c(1L, 1L, 2L, 2L), .Label = c("abc", "def"), class = "factor"),
Score1 = c(10L, 0L, 0L, 5L), Score2 = c(0L, 0L, 5L, 10L),
Score3 = c(0L, 11L, 2L, 11L)),
class = "data.frame", row.names = c(NA, -4L))
In R using dplyr I'm struggling to accumulate two columns through a sequence.
What I'd like to do:
Within each Outlet I'm trying to calculate the cumulative DFLSEcr (cumulative DFLSEcr = cumu_DFLSEcr) and count (cumu_count) for each row based on a sequence of ZHYD and NextDown. Each row has a value for NextDown which corresponds to the row which comes before it signified by a matching ZHYD. This makes a sequence in which DFLSEcr and count accumulate. Where Exutoire == 0 then cumu_DFLSEcr == 0 and cumu_count == 0. If DFLSEcr == 1 or NA then don't include it in the sum. I've used lag() but I don't think this is correct...
Input:
input <- structure(list(ZHYD = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
10L, 8L, 9L, 11L), .Label = c("B020006183", "B020006184", "B020006185",
"B020006190", "B020006199", "B020006212", "B020006228", "B020006278",
"B020006285", "B020006290", "B020006325"), class = "factor"),
Outlet = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 3L), .Label = c("BSO0001727", "BSO0001746", "BSO0001756"
), class = "factor"), NextDown = structure(c(1L, 1L, 2L,
2L, 3L, 3L, NA, NA, 4L, 4L, 5L), .Label = c("B020006190",
"B020006199", "B020006228", "B020006290", "B020006335"), class = "factor"),
count = c(15L, 55L, 42L, 19L, 32L, 6L, 19L, 49L, 4L, 82L,
5L), DFLSEcr = c(0.07, 0.02, 0.02, 0.05, 0.03, 0.17, 0.05,
0.02, 0.25, 0.01, NA), Exutoire = c(0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-11L), .Names = c("ZHYD", "Outlet", "NextDown", "count", "DFLSEcr",
"Exutoire"))
Method tried so far:
# Attempt from the question: per-Outlet running totals shifted down one row
# with lag(). cumsum() follows the row order of the data, not the
# ZHYD -> NextDown links.
input %>%
group_by(Outlet) %>%
select(ZHYD, NextDown, ZHYD, Outlet, Exutoire, count, DFLSEcr) %>%
mutate(cleanDFLSEcr = replace(DFLSEcr, DFLSEcr == 1, 0),
# NOTE(review): this second assignment starts again from DFLSEcr, so the
# "== 1" replacement made on the previous line is discarded.
cleanDFLSEcr = replace(DFLSEcr, is.na(DFLSEcr), 0),
cumu_DFLSEcr = if_else(Exutoire == 1, 0, cumsum(cleanDFLSEcr)) %>% lag(1, default = 0),
cumu_count = if_else(Exutoire == 1, 0, as.numeric(cumsum(count))) %>% lag(1, default = 0)) %>%
# The helper column is only needed for the cumulative sum above.
select(-cleanDFLSEcr)
Desired output:
ZHYD Outlet NextDown count DFLSEcr Exutoire cumu_count cumu_DFLSEcr
<fct> <fct> <fct> <int> <dbl> <int> <int> <dbl>
1 B020006183 BSO0001727 B020006190 15 0.0700 0 70 0.130
2 B020006184 BSO0001727 B020006190 55 0.0200 0 70 0.130
3 B020006185 BSO0001727 B020006199 42 0.0200 0 51 0.0800
4 B020006190 BSO0001727 B020006199 19 0.0500 0 51 0.0800
5 B020006199 BSO0001727 B020006228 32 0.0300 0 19 0.0500
6 B020006212 BSO0001727 B020006228 6 0.170 0 19 0.0500
7 B020006228 BSO0001727 <NA> 19 0.0500 1 0 0.
8 B020006290 BSO0001746 <NA> 49 0.0200 1 0 0.
9 B020006278 BSO0001746 B020006290 4 0.250 0 1 0.0200
10 B020006285 BSO0001746 B020006290 82 0.0100 0 1 0.0200
11 B020006325 BSO0001756 B020006335 5 NA 0 1 0.0200
cumu_count and cumu_DFLSEcr outputs are the same in some cases because they share the same NextDown.
The final row in the sequence shouldn't be included in cumu_count. So for the first row cumu_count == 19 +32 + 19 = 70
Edit
Turns out I needed igraph as this is a routing problem.
OK, your problem is not simple: you have nested data, which is a bit complicated to manipulate.
Here is one answer. There are surely better ways to do it, but it may give you some ideas.
library(dplyr)
library(tidyr)
# Lookup table: each node and the node directly downstream of it.
father_son_table <- select(input, actual = ZHYD, father = NextDown)

# For every row, follow the NextDown links up to three steps downstream.
# A suffix is only added when column names clash. In the first join both
# sides have a `father` column, giving father.1 / father.2. In the second
# join only the lookup table has `father`, so the third step is expected to
# come out as plain `father` rather than `father.3`.
sequences <- rename(input, actual = ZHYD, father = NextDown) %>%
  left_join(father_son_table, by = c("father" = "actual"), suffix = c(".1", ".2")) %>%
  left_join(father_son_table, by = c("father.2" = "actual"), suffix = c(".1", ".3")) %>%
  tibble::rowid_to_column(var = "sequence_number")

# One row per (sequence, node); `order` is the number of steps downstream of
# the starting node. Rows with no node at that step are removed by na.omit().
table_order <- sequences %>%
  select(-count, -DFLSEcr, -Exutoire, -Outlet) %>%
  gather(key = height, value = node, -sequence_number) %>%
  mutate(order = case_when(
    height == "actual" ~ 0,
    height == "father.1" ~ 1,
    height == "father.2" ~ 2,
    # Accept both spellings of the third step. Matching only "father.3" left
    # `order` as NA for the unsuffixed column, and na.omit() then dropped
    # every third-step node from the totals.
    height %in% c("father", "father.3") ~ 3
  )) %>%
  na.omit() %>%
  select(sequence_number, node, order)

# Attach the measurements of every node in a sequence, total them per
# sequence and Outlet, and subtract the starting row's own value so that only
# the downstream nodes are accumulated. Keep the starting row (order == 0).
result <- left_join(table_order, input, by = c("node" = "ZHYD")) %>%
  arrange(sequence_number, order) %>%
  group_by(sequence_number, Outlet) %>%
  mutate(cumu_count = sum(count) - count,
         cumu_DFLSE_cr = sum(DFLSEcr) - DFLSEcr) %>%
  filter(order == 0)
Following this question and this one, I wondered what was the best option to summarise categorical variables in one dataset.
I have a dataset such as
# A tibble: 10 × 4
empstat_couple nssec7_couple3 nchild07 age_couple
<chr> <fctr> <fctr> <dbl>
1 Neo-Trad Lower Managerial 1child 39
2 Neo-Trad Higher Managerial 1child 31
3 Neo-Trad Manual and Routine 1child 33
4 Trad Higher Managerial 1child 43
The 3 first variables are categorical (character or factor) and the last numerical.
What I would like is something like (output)
var n p
1: Neo-Trad 6 0.6
2: OtherArrangment 2 0.2
3: Trad 2 0.2
4: Higher Managerial 4 0.4
5: Lower Managerial 5 0.5
6: Manual and Routine 1 0.1
7: 1child 9 0.9
8: 2children 1 0.1
Well for the numerical variable, I am unsure how to add it meaningfully to the summary.
I guess the most basic way to go is
library(dplyr)
library(data.table)
# Count each categorical column separately, add the share within that column,
# and stack the three tables with rbindlist().
add_share <- function(counts) mutate(counts, p = n / sum(n))
emp_tab <- add_share(count(dt, empstat_couple))
nssec_tab <- add_share(count(dt, nssec7_couple3))
child_tab <- add_share(count(dt, nchild07))
rbindlist(list(emp_tab, nssec_tab, child_tab))
I wondered if a summarise_each solution existed ?
This doesn't work
dt %>% summarise_each(funs(count))
Using apply I could come up with this
apply(dt, 2, as.data.frame(table)) %>% rbindlist()
But it's not great.
Any suggestions ?
data
dt = structure(list(empstat_couple = c("Neo-Trad", "Neo-Trad", "Neo-Trad",
"Trad", "OtherArrangment", "Neo-Trad", "Trad", "OtherArrangment",
"Neo-Trad", "Neo-Trad"), nssec7_couple3 = structure(c(2L, 1L,
4L, 1L, 2L, 2L, 1L, 2L, 1L, 2L), .Label = c("Higher Managerial",
"Lower Managerial", "Intermediate", "Manual and Routine"), class = "factor"),
nchild07 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L), .Label = c("1child", "2children", ">2children"), class = "factor"),
age_couple = c(39, 31, 33, 43, 32, 28, 28, 40, 33, 26), hldid = 1:10), .Names = c("empstat_couple",
"nssec7_couple3", "nchild07", "age_couple", "hldid"), row.names = c(NA,
-10L), class = "data.frame")
We can melt with data.table and get the .N and proportion
library(data.table)
# Only the categorical columns are stacked: the numeric age and the household
# id are kept as id columns so their values are not counted as categories.
# `id.vars` is spelled out in full rather than relying on partial matching.
long <- melt(setDT(dt), id.vars = c("age_couple", "hldid"))
# Count per variable *and* value, so that an identical label occurring in two
# variables is not pooled, then convert counts to within-variable proportions.
long[, .(n = .N), by = .(variable, value)
  ][, p := n / sum(n), by = variable
  ][, variable := NULL][]
Or using dplyr/tidyr
library(dplyr)
library(tidyr)
# Keep only the categorical columns before stacking, so that neither the
# numeric age nor the household id is counted as a category.
dt %>%
  select(-age_couple, -hldid) %>%
  gather(var1, var) %>%
  # Count per variable and value so equal labels in two variables stay apart.
  group_by(var1, var) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  # One row per variable/value pair, in order of first appearance.
  distinct() %>%
  group_by(var1) %>%
  mutate(p = n / sum(n)) %>%
  ungroup() %>%
  select(-var1)