Adding a Proportion Column with Dplyr - r

Let's say I had the following data frame, that was also altered to include counts of a,b, and c, based on whether or not they are classified by Z = 0 or 1
X <- (1:10)
Y<- c('a','b','a','c','b','b','a','a','c','c')
Z <- c(0,1,1,1,0,1,0,1,1,1)
test_df <- data.frame(X,Y,Z)
(the code below was provided by a stack exchange member, thank you!)
res <- test_df %>% group_by(Y,Z) %>% summarise(N=n()) %>%
pivot_wider(names_from = Z,values_from=N,
values_fill = 0)
How might I add a column on the right that indicates, for each letter, the proportion of its appearances for which Z = 1? It would seem that a basic summary statement should work, but I can't figure out how...
My expected output would be something like
Z=0 Z=1 PropZ=1
a 2 2 .5
b 1 2 .66
c 0 3 1

Perhaps this helps
library(dplyr)
library(tidyr)
# Count rows per (Y, Z) pair, attach each letter's share of Z == 1,
# then spread the counts into one column per Z value.
test_df %>%
  group_by(Y, Z) %>%
  summarise(N = n(), .groups = "drop") %>%
  left_join(
    test_df %>%
      group_by(Y) %>%
      summarise(Prop = mean(Z == 1), .groups = "drop"),
    # Name the key explicitly: avoids the "Joining, by" message and keeps the
    # join from silently changing if the two tables ever share another column.
    by = "Y"
  ) %>%
  pivot_wider(names_from = Z, values_from = N, values_fill = 0)
-output
# A tibble: 3 x 4
# Y Prop `0` `1`
# <chr> <dbl> <int> <int>
#1 a 0.5 2 2
#2 b 0.667 1 2
#3 c 1 0 3

test_df %>% group_by(Y) %>%
summarise( z0 = sum(Z == 0), z1 = sum(Z == 1) , PropZ = z1/n())

I am not sure what your expected output is, but below are some options
u <- xtabs(q ~ Y + Z, cbind(test_df, q = 1))
> u
Z
Y 0 1
a 2 2
b 1 2
c 0 3
or
> prop.table(u)
Z
Y 0 1
a 0.2 0.2
b 0.1 0.2
c 0.0 0.3

To calculate proportions of 1 for each letter you can use rowSums.
transform(res, prop_1 = `1`/rowSums(res[-1]))
In dplyr :
library(dplyr)
res %>%
ungroup %>%
mutate(prop_1 = `1`/rowSums(.[-1]))
# Y `0` `1` prop_1
# <chr> <int> <int> <dbl>
#1 a 2 2 0.5
#2 b 1 2 0.667
#3 c 0 3 1

Related

Summarize one variable/column over all possible values of other variables/columns

I need to summarize one variable/column of a long table after aggregating (group_by()) by another variable/column, I need to have the summarized value by all values of other variables/columns.
Here is test data:
library(tidyverse)
set.seed(123)
Site <- str_c("S", 1:5)
Species <- str_c("Sps", 1:6)
print(Species_tbl <- bind_cols(Species = Species,
Exotic = rbinom(length(Species), 1, .3),
Migrant = rbinom(length(Species), 2, .3)))
Data_tbl <- expand.grid(Site = Site,
Species = Species) %>%
left_join(Species_tbl)
Data_tbl$Presence <- rbinom(nrow(Data_tbl), 1, .5)
And here is my best effort:
print(Data_tbl %>%
group_by(Site) %>%
summarise(N_sp = sum(Presence),
N_sp_Exo = sum(Presence[Exotic == 1]),
N_sp_Nat = sum(Presence[Exotic == 0]),
N_sp_M0 = sum(Presence[Migrant == 0]),
N_sp_M1 = sum(Presence[Migrant == 1]),
N_sp_M2 = sum(Presence[Migrant == 2])))
You can reshape the columns of interest, c(Exotic, Migrant), into long format and take the sum of Presence for each combination of column name and its value. This can then be joined with the per-Site total.
library(dplyr)
library(tidyr)
data1 <- Data_tbl %>%
group_by(Site) %>%
summarise(N_sp = sum(Presence))
data2 <- Data_tbl %>%
pivot_longer(cols = c(Exotic, Migrant)) %>%
group_by(Site, name, value) %>%
summarise(result = sum(Presence), .groups = "drop") %>%
pivot_wider(names_from = c(name, value), values_from = result)
inner_join(data1, data2, by = 'Site')
# Site N_sp Exotic_0 Exotic_1 Migrant_0 Migrant_1 Migrant_2
# <fct> <int> <int> <int> <int> <int> <int>
#1 S1 4 2 2 1 2 1
#2 S2 3 2 1 0 2 1
#3 S3 2 1 1 0 2 0
#4 S4 4 2 2 1 3 0
#5 S5 4 1 3 1 2 1
The answer has been divided in two steps for ease of readability. If you would like to do this in a single chain without creating temporary variables that can be done as well.

creating dataframe from vectors

I have the following vectors:
bid = c(1,5,10,20,30,40,50)
n = c(31,29,27,25,23,21,19)
yes = c(0,3,6,7,9,13,17)
no = n - yes
I have two questions, and I don't find any solutions for them, I would appreciate if someone can help me.
Q1: I want to write R code to create a two-column dataframe df. Column 1 has Bid,
where each Bid is repeated n times; Column 2 has c(rep(1,yes),rep(0,no)) at
each bid.
Q2: Then when I have the data frame df, I want to write R codes to generate
(from df) vectors bid, n, yes, and no, again.
It is a bit unclear what you actually want. It is easier if you provide the desired result. Would this fit your Q1:
library(tidyverse)
bid = c(1,5,10,20,30,40,50)
n = c(31,29,27,25,23,21,19)
yes = c(0,3,6,7,9,13,17)
no = n - yes
df <- tibble(bid, yes, n, no = n -yes) %>% dplyr::select(- n) %>% pivot_longer(cols = c(yes, no)) %>% uncount(value) %>% mutate(yesno = ifelse(name == "yes", 1,0)) %>% dplyr::select(-name)
# Q2: rebuild the summary vectors from df alone.
# count() keeps bid numeric (table() + as.data.frame() would turn it into a
# factor), and values_fill supplies the zero for bids that have no "yes" rows.
df2 <- df %>%
  count(bid, yesno, name = "freq") %>%
  pivot_wider(names_from = yesno, values_from = freq, values_fill = 0L) %>%
  # Rename before computing n so that yes/no refer to the counted columns,
  # not to the global vectors of the same name.
  rename(no = `0`, yes = `1`) %>%
  mutate(n = yes + no)
bid <- df2$bid
n <- df2$n
yes <- df2$yes
no <- df2$no
I don't know what you mean for Q2, but for Q1 you could do this:
library(tidyverse)
pmap_dfr(list(bid, n, yes, no),
\(V1, V2, V3, V4) tibble(col1 = rep(V1, V2),
col2 = c(rep(1,V3),rep(0,V4))))
#> # A tibble: 175 x 2
#> col1 col2
#> <dbl> <dbl>
#> 1 1 0
#> 2 1 0
#> 3 1 0
#> 4 1 0
#> 5 1 0
#> 6 1 0
#> 7 1 0
#> 8 1 0
#> 9 1 0
#> 10 1 0
#> # ... with 165 more rows
EDIT:
For Q2, you can follow this:
library(tidyverse)
df <- pmap_dfr(list(bid, n, yes, no),
\(V1, V2, V3, V4) tibble(col1 = rep(V1, V2),
col2 = c(rep(1,V3),rep(0,V4))))
df2 <- df |>
count(col1, col2) |>
group_by(col1) |>
summarise(yes = sum(n[col2==1]),
n = sum(n))
bid2 <- df2$col1
n2 <- df2$n
yes2 <- df2$yes
no2 <- n2 - yes2
all.equal(c(bid, n, yes, no), c(bid2, n2, yes2, no2))
#> [1] TRUE

mutate if feature exists else NA_real_

A dataframe:
exdf <- data.frame(
a = 1:3,
b = c(2,2,2)
)
Sometimes b is present, in which case one can do this:
exdf %>% mutate(c = a / b)
But, sometimes feature b will not be present, in which case:
exdf %>% select(-b) %>% mutate(c = a / b)
Error: Problem with `mutate()` input `c`.
x object 'b' not found
ℹ Input `c` is `a/b`.
I want to tell dplyr to try the mutation, else if something goes wrong just make new feature c all NA_real_ as opposed to a / b.
Can this be done?
We can use a condition with if/else on exists
library(dplyr)
exdf %>%
  select(-b) %>%
  # Check the piped data's own column names. exists("b") would also be TRUE
  # for an unrelated object called `b` in an enclosing environment, and the
  # division would then silently use that object instead of yielding NA.
  mutate(c = if ("b" %in% names(.)) a / b else NA_real_)
Set up a simple if else statement within mutate which checks whether the column name is in the data.frame or not.
> exdf %>%
... dplyr::rowwise() %>%
... dplyr::mutate(q = ifelse("b" %in% colnames(.), a/b, NA_real_))
# A tibble: 3 x 3
# Rowwise:
a b q
<int> <dbl> <dbl>
1 1 2 0.5
2 2 2 1
3 3 2 1.5
> exdf %>%
... dplyr::select(-b) %>%
... dplyr::rowwise() %>%
... dplyr::mutate(q = ifelse("b" %in% colnames(.), a/b, NA_real_))
# A tibble: 3 x 2
# Rowwise:
a q
<int> <dbl>
1 1 NA
2 2 NA
3 3 NA

R Summarize With DataTable

data=data.frame("StudentID"=c(1,2,3,4,5),
"Class"=c(1,2,2,3,3),
"Type"=c('A','A','B','B','B'))
Say you have data as shown above and you wish for summaries like this,
What is the effective solution to do this and output to a csv in organized way such as shown above?
Example data if there are weights involved and you want weighted counts and proportions.
data1=data.frame("StudentID"=c(1,2,3,4,5),
"Class"=c(1,2,2,3,3),
"Type"=c('A','A','B','B','B'),
"Weighting"=c(10,6,13,12,2))
One option is map
library(dplyr)
library(purrr)
# For each of the columns Class and Type: count the rows per distinct value
# and express each count as a share of the total, then row-bind the results.
map_dfr(names(data)[2:3], \(col)
  data %>%
    # all_of() marks `col` as an external character vector of column names;
    # passing it bare to select()/group_by_at() is deprecated/superseded.
    group_by(across(all_of(col))) %>%
    summarise(COUNT = n(), .groups = "drop") %>%
    mutate(PROP = COUNT / sum(COUNT)))
# A tibble: 5 x 4
# Class COUNT PROP Type
#* <dbl> <int> <dbl> <fct>
#1 1 1 0.2 <NA>
#2 2 2 0.4 <NA>
#3 3 2 0.4 <NA>
#4 NA 2 0.4 A
#5 NA 3 0.6 B
Or with data.table by melting into 'long' format
library(data.table)
melt(setDT(data), id.var = 'StudentID')[, .(COUNT = .N),
.(variable, value)][, PROP := COUNT/sum(COUNT),.(variable)][]
Or with base R using table and prop.table
lapply(data[-1], function(x) {x1 <- table(x); x2 <- prop.table(x1); cbind(COUNT = x1, PROP = x2)})
Both summaries are simple, here I use dplyr. To combine them in the way you want, it's going to need to be slapped together in a somewhat inelegant way. You can remove the name col1 if you want
library(dplyr)
df1 <- data %>% group_by(Class) %>%
summarise(Count = n(), Prop = n() / nrow(data))
df2 <- data %>% group_by(Type) %>%
summarise(Count = n(), Prop = n() / nrow(data))
names(df1)[1] <- 'col1'
names(df2)[1] <- 'col1'
rbind(
c('Class', '', ''),
df1,
c('Type', '', ''),
df2
)
# A tibble: 7 x 3
col1 Count Prop
<chr> <chr> <chr>
1 Class "" ""
2 1 1 0.2
3 2 2 0.4
4 3 2 0.4
5 Type "" ""
6 A 2 0.4
7 B 3 0.6

summaries extracted from data frame info

# Example data from the question, wrapped in read.table() so it can be run
# as-is; the literal NA entries are read as missing values.
data <- read.table(header = TRUE, text = "
STUDY ID BASE CYCLE1 DIED PROG
1 1 100 30 No Yes
1 2 NA 20 Yes No
1 3 16 NA Yes Yes
1 4 15 10 Yes Yes
")
I wanted to make a summary of the following:
how many subjects have both baseline and CYCLE1 value?
Of those in 1, how many had DIED?
Of those in 1, how many had DIED or PROG?
Answers:
2-subjects (50% of subjects) ==> subjects 1 & 4
1-subject (25%) ===> this is subject 4
2-subjects (50%) ==> subjects 1 & 4
A summary table by STUDY for this would be great (showing the number and percentage).
I am using Rstudio.
If it is based on the first filter
library(dplyr)
library(stringr)
data %>%
group_by(STUDY) %>%
filter(!is.na(BASE) & !is.na(CYCLE1)) %>%
summarise(ID = str_c(ID, collapse=", "),
n1 = n(),
n2 = sum(DIED== "Yes"),
n3 = sum(DIED == "Yes"|PROG == "Yes"))
# A tibble: 1 x 5
# STUDY ID n1 n2 n3
# <int> <chr> <int> <int> <int>
#1 1 1, 4 2 1 2
if we need the percentage as well
out <- data %>%
group_by(STUDY) %>%
mutate(i1 = !is.na(BASE) & !is.na(CYCLE1),
perc1 = 100 * mean(i1),
n1 = sum(i1),
i2 = DIED == "Yes" & i1,
perc2 = 100 * mean(i2),
n2 = sum(i2),
i3 = (DIED == "Yes"|PROG == "Yes") & i1,
perc3 = 100 * mean(i3),
n3 = sum(i3)) %>%
filter(i1) %>%
select(STUDY, ID, matches("perc"), matches("n")) %>%
mutate(ID = toString(ID)) %>%
slice(1)
# A tibble: 1 x 8
# Groups: STUDY [1]
# STUDY ID perc1 perc2 perc3 n1 n2 n3
# <int> <chr> <dbl> <dbl> <dbl> <int> <int> <int>
#1 1 1, 4 50 25 50 2 1 2
It can be further modified to format the output
library(tidyr) # 0.8.3.9000
out %>%
pivot_longer(cols = perc1:n3, names_to = c( "perc", "n"),
names_sep = "(?<=[a-z])(?=[0-9])") %>%
group_by(STUDY, ID, n) %>%
summarise(value = sprintf("%d (%d%%)", last(value), first(value))) %>%
select(-n)

Resources