dplyr to calculate fraction by group - r

There are only 2 farms, but many kinds of fruit. I am trying to see which farm has performed better over 3 years, where performance is simply each farm's share of the fruit's total, i.e. farm_i / (farm1 + farm2). For example, for fruit == peach in y2019, farm1's performance was 20% vs. 80% for farm2.
sample data:
df <- data.frame(fruit = c("apple", "apple", "peach", "peach", "pear", "pear", "lime", "lime"),
farm = as.factor(c(1,2,1,2,1,2,1,2)), 'y2019' = c(0,0,3,12,0,7,4,6),
'y2018' = c(5,3,0,0,8,2,0,0),'y2017' = c(4,5,7,15,0,0,0,0) )
> df
fruit farm y2019 y2018 y2017
1 apple 1 0 5 4
2 apple 2 0 3 5
3 peach 1 3 0 7
4 peach 2 12 0 15
5 pear 1 0 8 0
6 pear 2 7 2 0
7 lime 1 4 0 0
8 lime 2 6 0 0
>
desired output:
out
fruit farm y2019 y2018 y2017
1 apple 1 0.0 0.625 0.444444
2 apple 2 0.0 0.375 0.555556
3 peach 1 0.2 0.000 0.318182
4 peach 2 0.8 0.000 0.681818
5 pear 1 0.0 0.800 0.000000
6 pear 2 1.0 0.200 0.000000
7 lime 1 0.4 0.000 0.000000
8 lime 2 0.6 0.000 0.000000
>
This is as far as I could go:
df %>%
group_by(fruit) %>%
summarise(across(where(is.numeric), sum))

We can group by 'fruit', then mutate across the columns that start with 'y', dividing each element by the sum of the values in that column within the group; if all values in the group are 0, return 0 instead (this avoids 0/0).
library(dplyr)
# Within each fruit, express every year column as that row's share of the
# fruit's total. When nothing was produced in a year (all values 0) the
# share is reported as 0 instead of dividing 0 by 0.
df %>%
  group_by(fruit) %>%
  mutate(across(starts_with('y'), function(yr) {
    if (all(yr == 0)) {
      0
    } else {
      yr / sum(yr)
    }
  }))
# A tibble: 8 x 5
# Groups: fruit [4]
# fruit farm y2019 y2018 y2017
# <chr> <fct> <dbl> <dbl> <dbl>
#1 apple 1 0 0.625 0.444
#2 apple 2 0 0.375 0.556
#3 peach 1 0.2 0 0.318
#4 peach 2 0.8 0 0.682
#5 pear 1 0 0.8 0
#6 pear 2 1 0.2 0
#7 lime 1 0.4 0 0
#8 lime 2 0.6 0 0
NOTE: Here, we just used dplyr package and it is done in a single step
Or another option is adorn_percentages from janitor
library(janitor)
library(purrr)
# Split df into one data frame per fruit, let janitor convert each piece to
# column-wise proportions (denominator = "col"), and row-bind the pieces back
# into a single tibble.
# NOTE(review): group_split() presumably returns the pieces ordered by fruit,
# so the rows may come back in a different order than in df -- confirm.
# NOTE(review): a column that is all zeros within a fruit presumably yields
# NaN (0/0) here rather than 0 -- confirm and replace if 0 is required.
df %>%
group_split(fruit) %>%
map_dfr(adorn_percentages, denominator = "col") %>%
as_tibble
Or using data.table
library(data.table)
# Convert df to a data.table by reference and overwrite columns 3 to 5, per
# fruit, with each row's share of the group total. A group that is all zeros
# gets 0 so that 0/0 is never evaluated. The trailing [] prints the result.
setDT(df)[
  ,
  (3:5) := lapply(.SD, function(col) {
    if (all(col == 0)) 0 else col / sum(col, na.rm = TRUE)
  }),
  by = fruit,
  .SDcols = 3:5
][]
Or using base R
# Per-fruit totals of columns 3 to 5: one row per fruit, with the fruit names
# as row names.
grpSums <- rowsum(df[3:5], df$fruit)
# Divide every row by the totals of its own fruit (looked up via match()).
df[3:5] <- df[3:5] / grpSums[match(df$fruit, row.names(grpSums)), ]
# A fruit with no production in a year gives 0/0 = NaN; report those shares
# as 0. Only NaN is replaced, so genuine NA values are left untouched.
df[3:5] <- lapply(df[3:5], function(x) replace(x, is.nan(x), 0))

We can use prop.table to calculate the proportions for each fruit.
library(dplyr)
# Within each fruit, turn every numeric column into proportions of the group
# total. A group whose values are all 0 gives 0/0 = NaN, which is replaced
# with 0. The replacement is done inside one lambda because passing extra
# arguments through across(...) is deprecated as of dplyr 1.1.0.
df %>%
  group_by(fruit) %>%
  mutate(across(
    where(is.numeric),
    ~ tidyr::replace_na(prop.table(.x), 0)
  ))
# fruit farm y2019 y2018 y2017
# <chr> <fct> <dbl> <dbl> <dbl>
#1 apple 1 0 0.625 0.444
#2 apple 2 0 0.375 0.556
#3 peach 1 0.2 0 0.318
#4 peach 2 0.8 0 0.682
#5 pear 1 0 0.8 0
#6 pear 2 1 0.2 0
#7 lime 1 0.4 0 0
#8 lime 2 0.6 0 0

Related

How can I split sentence into new variables in R (with zero-one encoding)?

I have a data like below:
V1 V2
1 orange, apple
2 orange, lemon
3 lemon, apple
4 orange, lemon, apple
5 lemon
6 apple
7 orange
8 lemon, apple
I want to split the V2 variable like this:
I have three categories of the V2 column: "orange", "lemon", "apple"
for each of the categories I want to create a new column (variable) that will inform about whether such a name appeared in V2 (0,1)
I tried this
df %>% separate(V2, into = c("orange", "lemon", "apple"))
.. and I got this result, but it's not what I expect.
V1 orange lemon apple
1 1 orange apple <NA>
2 2 orange lemon <NA>
3 3 lemon apple <NA>
4 4 orange lemon apple
5 5 lemon <NA> <NA>
6 6 apple <NA> <NA>
7 7 orange <NA> <NA>
8 8 lemon apple <NA>
The result I mean is below.
V1 orange lemon apple
1 1 0 1
2 1 1 0
3 0 1 1
4 1 1 0
5 0 1 0
6 0 0 1
7 1 0 0
8 0 1 1
you could try pivoting:
library(dplyr)
library(tidyr)
# One row per (V1, item): split the comma-separated V2 into rows, flag every
# item that is present with 1, then spread the items into columns and fill
# the combinations that never occurred with 0.
df |>
  separate_rows(V2, sep = ", ") |>
  mutate(ind = 1) |>
  pivot_wider(
    id_cols = V1,
    names_from = V2,
    values_from = ind,
    values_fill = 0
  )
Output is:
# A tibble: 8 × 4
V1 orange apple lemon
<int> <dbl> <dbl> <dbl>
1 1 1 1 0
2 2 1 0 1
3 3 0 1 1
4 4 1 1 1
5 5 0 0 1
6 6 0 1 0
7 7 1 0 0
8 8 0 1 1
data I used:
V1 <- 1:8
V2 <- c("orange, apple", "orange, lemon",
"lemon, apple", "orange, lemon, apple",
"lemon", "apple", "orange",
"lemon, apple")
df <- tibble(V1, V2)
We may use dummy_cols
library(stringr)
library(fastDummies)
library(dplyr)
# Build indicator columns from V2, splitting its values on a comma followed
# by whitespace, and drop the original V2 column
# (remove_selected_columns = TRUE).
# NOTE(review): the new columns are presumably named "V2_<value>" -- confirm.
dummy_cols(df, "V2", split = ",\\s+", remove_selected_columns = TRUE) %>%
# Remove everything up to and including the last underscore from every column
# name. The pattern is greedy, so any other column whose name contains an
# underscore would be shortened as well.
rename_with(~ str_remove(.x, '.*_'))
-output
# A tibble: 8 × 4
V1 apple lemon orange
<int> <int> <int> <int>
1 1 1 0 1
2 2 0 1 1
3 3 1 1 0
4 4 1 1 1
5 5 0 1 0
6 6 1 0 0
7 7 0 0 1
8 8 1 1 0

Calculate proportion of several binary variables by another variable

I have data with several binary variables, and I want to calculate the proportion of each one, by another variable.
Example
I survey people and ask them: Please mark which of the following fruits you like (can mark more than one choice): ☐ Banana ☐ Apple ☐ Orange ☐ Strawberry ☐ Peach
Each person who checked the box gets 1 in the data, and when leaving blank it's denoted as 0. The data looks like that:
library(dplyr)
set.seed(2021)
# Simulated survey: 20 people x 5 fruits of 0/1 answers drawn with a single
# random success probability, plus a person id and a randomly drawn gender.
# The order of the random draws (runif, rbinom, sample) is unchanged, so the
# result for a given seed is the same as before.
my_df <-
  matrix(rbinom(n = 100, size = 1, prob = runif(1)), ncol = 5) %>%
  as.data.frame() %>%
  # TRUE instead of T: T is an ordinary variable that can be reassigned.
  cbind(1:20, ., sample(c("male", "female"), size = 20, replace = TRUE)) %>%
  setNames(c("person_id", "banana", "apple", "orange", "strawberry", "peach", "gender"))
my_df
#> person_id banana apple orange strawberry peach gender
#> 1 1 1 1 1 0 0 female
#> 2 2 1 0 0 0 1 female
#> 3 3 0 0 1 0 1 female
#> 4 4 1 1 0 1 0 female
#> 5 5 1 1 1 0 0 male
#> 6 6 1 1 1 0 1 female
#> 7 7 0 1 0 1 1 male
#> 8 8 1 1 0 0 0 male
#> 9 9 1 1 1 0 0 female
#> 10 10 0 0 0 0 0 male
#> 11 11 1 1 1 1 1 male
#> 12 12 1 1 0 0 1 male
#> 13 13 1 1 0 1 0 male
#> 14 14 1 1 0 0 0 male
#> 15 15 0 0 0 0 1 male
#> 16 16 0 1 0 0 1 male
#> 17 17 1 0 0 0 1 male
#> 18 18 1 1 1 1 1 male
#> 19 19 0 0 1 1 1 female
#> 20 20 0 0 0 0 0 female
Created on 2021-02-01 by the reprex package (v0.3.0)
I want to get the proportion for each fruit, split by gender. From this answer I learned how to do it for one variable (for example, banana):
my_df %>%
group_by(gender) %>%
summarise(n_of_observations = n(), prop = sum(banana == 1)/n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
## gender n_of_observations prop
## <chr> <int> <dbl>
## 1 female 10 0.6
## 2 male 10 0.4
But how can I get such a table for all fruits?
Desired output:
## fruit gender prop
## <chr> <chr> <dbl>
## 1 banana female 0.6
## 2 banana male 0.4
## 3 apple female 0.4
## 4 apple male 0.3
## 5 orange female 0.3
## 6 orange male 0.1
## 7 strawberry female 0.4
## 8 strawberry male 0.4
## 9 peach female 0.3
## 10 peach male 0.6
I'm looking for a dplyr solution, if possible. Thanks a lot!
You can use across to summarize multiple variables at once:
# For every fruit column, per gender: the number of respondents (n) and the
# proportion of them who ticked the fruit (prop). The result columns are
# named <fruit>_n and <fruit>_prop.
my_df %>%
  group_by(gender) %>%
  summarise(across(
    banana:peach,
    list(
      n = function(answers) length(answers),
      prop = function(answers) sum(answers == 1) / n()
    )
  ))
# A tibble: 2 x 11
gender banana_n banana_prop apple_n apple_prop orange_n orange_prop strawberry_n strawberry_prop peach_n peach_prop
<chr> <int> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int> <dbl>
1 female 8 0.625 8 0.5 8 0.625 8 0.25 8 0.5
2 male 12 0.667 12 0.75 12 0.25 12 0.333 12 0.583
Note that the first argument of across specifies the variables you want to summarize. Here, I wrote banana:peach meaning all columns between banana and peach.
You can use tidyr to pivot your data first and then summarize it:
library(tidyr)
# Reshape to one row per person and fruit (the answers land in the default
# "value" column), then compute the share of 1s per gender and fruit.
my_df %>%
  tidyr::pivot_longer(cols = banana:peach, names_to = "fruit") %>%
  dplyr::group_by(gender, fruit) %>%
  dplyr::summarise(prop = sum(value) / n())
gender fruit prop
<chr> <chr> <dbl>
1 female apple 0.5
2 female banana 0.625
3 female orange 0.625
4 female peach 0.5
5 female strawberry 0.25
6 male apple 0.75
7 male banana 0.667
8 male orange 0.25
9 male peach 0.583
10 male strawberry 0.333
You can pipe it to arrange if you want to sort by fruit. You can also add the number of observations in the summarize function with n = n().

replace zeros with NA conditionally in

In the below df, there are only 2 farms, so each fruit is duplicated. i'd like to replace zeros with NA as follows
df[df==0] <- NA
However, whenever there is a value for either of the fruits, such as for a pear at y2019 with values c(0, 7), i'd like not to replace 0. dplyr solution would be great.
sample data:
df <- data.frame(fruit = c("apple", "apple", "peach", "peach", "pear", "pear", "lime", "lime"),
farm = as.factor(c(1,2,1,2,1,2,1,2)), 'y2019' = c(0,0,3,12,0,7,4,6),
'y2018' = c(5,3,0,0,8,2,0,0),'y2017' = c(4,5,7,15,0,0,0,0) )
> df
fruit farm y2019 y2018 y2017
1 apple 1 0 5 4
2 apple 2 0 3 5
3 peach 1 3 0 7
4 peach 2 12 0 15
5 pear 1 0 8 0
6 pear 2 7 2 0
7 lime 1 4 0 0
8 lime 2 6 0 0
library(dplyr)
# Per fruit and year column: keep the values as they are when at least one
# farm has a non-zero value, otherwise replace the whole group with NA.
# across() replaces the superseded mutate_at(vars(...)) form.
df %>%
  group_by(fruit) %>%
  mutate(across(
    starts_with("y20"),
    ~ if (any(.x != 0)) .x else NA_real_
  )) %>%
  ungroup()
# # A tibble: 8 x 5
# fruit farm y2019 y2018 y2017
# <chr> <fct> <dbl> <dbl> <dbl>
# 1 apple 1 NA 5 4
# 2 apple 2 NA 3 5
# 3 peach 1 3 NA 7
# 4 peach 2 12 NA 15
# 5 pear 1 0 8 NA
# 6 pear 2 7 2 NA
# 7 lime 1 4 NA NA
# 8 lime 2 6 NA NA

Replace NA conditionally

This is an augmented version of my own question, as I could not explain it clearly in the comments.
There are only 2 farms, so each fruit is duplicated in the below df. i'd like to replace NA with 0 only if there is a value for either of the fruits, such as for a pear at y2019 with values c(NA, 7), i'd like to output c(0,7) instead.
sample data:
df <- data.frame(fruit = c("apple", "apple", "peach", "peach", "pear", "pear", "lime", "lime"),
farm = as.factor(c(1,2,1,2,1,2,1,2)), 'y2019' = c(NA,NA,3,12,NA,7,4,6),
'y2018' = c(5,3,NA,NA,8,2,NA,NA),'y2017' = c(4,5,7,15,NA,NA,1,NA))
> df
fruit farm y2019 y2018 y2017
1 apple 1 NA 5 4
2 apple 2 NA 3 5
3 peach 1 3 NA 7
4 peach 2 12 NA 15
5 pear 1 NA 8 NA
6 pear 2 7 2 NA
7 lime 1 4 NA 1
8 lime 2 6 NA NA
this is close
df %>%
group_by(fruit) %>%
mutate(across(where(is.numeric), ~ if (any(is.na(.))) 0 else .)) %>%
ungroup()
but :
7 gets wiped out in pear producing c(0,0).
i'd like to leave NA in when both farms are NA
#A tibble: 8 x 5
fruit farm y2019 y2018 y2017
<chr> <fct> <dbl> <dbl> <dbl>
1 apple 1 0 5 4
2 apple 2 0 3 5
3 peach 1 3 0 7
4 peach 2 12 0 15
5 pear 1 0 8 0
6 pear 2 0 2 0
7 lime 1 4 0 0
8 lime 2 6 0 0
desired outcome:
> df
fruit farm y2019 y2018 y2017
1 apple 1 NA 5 4
2 apple 2 NA 3 5
3 peach 1 3 NA 7
4 peach 2 12 NA 15
5 pear 1 0 8 NA
6 pear 2 7 2 NA
7 lime 1 4 NA 1
8 lime 2 6 NA 0
You can try :
library(dplyr)
# Per fruit and numeric column: when the group is entirely NA leave it alone,
# otherwise fill the missing entries with 0.
df %>%
  group_by(fruit) %>%
  mutate(across(where(is.numeric), function(vals) {
    is_missing <- is.na(vals)
    if (all(is_missing)) {
      vals
    } else {
      replace(vals, is_missing, 0)
    }
  })) %>%
  ungroup()
# A tibble: 8 x 5
# fruit farm y2019 y2018 y2017
# <chr> <fct> <dbl> <dbl> <dbl>
#1 apple 1 NA 5 4
#2 apple 2 NA 3 5
#3 peach 1 3 NA 7
#4 peach 2 12 NA 15
#5 pear 1 0 8 NA
#6 pear 2 7 2 NA
#7 lime 1 4 NA 1
#8 lime 2 6 NA 0
So we replace NA with 0 only if the group contains at least one value that is not NA.
We can use replace_na from tidyr if there are any non-NA elements to replace with 0 or else return the value
library(dplyr)
library(tidyr)
# Same rule using tidyr::replace_na(): a group that is entirely NA is
# returned unchanged, any other group has its NAs filled with 0.
df %>%
  group_by(fruit) %>%
  mutate(across(
    where(is.numeric),
    function(vals) if (all(is.na(vals))) vals else replace_na(vals, 0)
  )) %>%
  ungroup()
# A tibble: 8 x 5
# fruit farm y2019 y2018 y2017
# <chr> <fct> <dbl> <dbl> <dbl>
#1 apple 1 NA 5 4
#2 apple 2 NA 3 5
#3 peach 1 3 NA 7
#4 peach 2 12 NA 15
#5 pear 1 0 8 NA
#6 pear 2 7 2 NA
#7 lime 1 4 NA 1
#8 lime 2 6 NA 0
or another option without if/else by having two logical expressions in replace after doing the group by 'fruit'
# Variant without if/else: an entry is set to 0 when it is NA and its group
# holds at least one non-NA value (the scalar group test is recycled against
# the element-wise NA mask). The result stays grouped by fruit.
df %>%
  group_by(fruit) %>%
  mutate(across(
    where(is.numeric),
    function(vals) replace(vals, any(!is.na(vals)) & is.na(vals), 0)
  ))
# A tibble: 8 x 5
# Groups: fruit [4]
# fruit farm y2019 y2018 y2017
# <chr> <fct> <dbl> <dbl> <dbl>
#1 apple 1 NA 5 4
#2 apple 2 NA 3 5
#3 peach 1 3 NA 7
#4 peach 2 12 NA 15
#5 pear 1 0 8 NA
#6 pear 2 7 2 NA
#7 lime 1 4 NA 1
#8 lime 2 6 NA 0

For loop with two variables in R

I'm constructing a panel dataset, which is going well. I can't get over this problem where I want to create some variables out of another dataframe.
I'm pretty sure I need the for-loop but can't find the solution for this specific situation.
I have these two dataframes:
name <- c("apple", "apple", "apple", "orange", "orange", "orange", "orange","orange")
day <- c(1,8,9,0,2,2,2,7)
score <- c(7,7,8,1,5,8,4,4)
df1 <- data.frame(name, day, score)
&
name1 <- c("apple", "apple", "apple", "apple", "apple", "apple", "apple", "apple", "apple", "apple", "apple", "orange", "orange", "orange", "orange","orange", "orange", "orange", "orange","orange", "orange","orange")
day1 <- c(0,1,2,3,4,5,6,7,8,9,10,0,1,2,3,4,5,6,7,8,9,10)
volume_day <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
volume_day_cum <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
avg_score_day <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
avg_score_cum <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
var_day <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
var_cum <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
df2 <- data.frame(name1, day1, volume_day, volume_day_cum, avg_score_day, avg_score_cum, var_day, var_cum)
I have a panel dataset on name-day level. Therefore, the instances of df1, which are given scores per row, needs to be coded in df2 for matching name and day. If there is no match, the 0 can stay. I'm looking for the instances itself (volume), the average scores and variation per day and cumulatively for all three variables. The resulting dataframe should look like this:
volume_day <- c(0,1,0,0,0,0,0,0,1,1,0,1,0,3,0,0,0,0,1,0,0,0)
volume_day_cum <- c(0,1,1,1,1,1,1,1,2,3,3,1,1,4,4,4,4,4,5,5,5,5)
avg_score_day <- c(0,7,0,0,0,0,0,0,7,8,0,1,0,5.66,0,0,0,0,4,0,0,0)
avg_score_cum <- c(0,7,7,7,7,7,7,7,7,7.33,7.33,1,1,4.5,4.5,4.5,4.5,4.5,4.4,4.4,4.4,4.4)
var_day <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,2.88,0,0,0,0,0,0,0,0)
var_cum <- c(0,0,0,0,0,0,0,0,0,0.22,0.22,0,0,6.25,6.25,6.25,6.25,6.25,5.04,5.04,5.04,5.04)
resultdata <- data.frame(name1, day1, volume_day, volume_day_cum, avg_score_day, avg_score_cum, var_day, var_cum)
I'm relatively new to R and coding in general. If I have insufficiently described my issue just let me know. Hopefully someone can help me out here.
There are some inconsistencies between your df1 and your resultdata, but here's a shot:
library(dplyr)
# zoo is not needed: the cumulative variance is derived from running sums.
#
# Build the name-day panel:
#   1. summarise df1 per name and day (count, sample variance, mean, and the
#      sums needed for the cumulative statistics),
#   2. join onto every name-day combination listed in df2,
#   3. fill days without observations with 0,
#   4. accumulate within each name.
# Variances are sample variances, as returned by var(); a day or running
# total with fewer than two scores gets 0.
df1 %>%
  group_by(name, day) %>%
  summarize(
    volume_day = as.numeric(n()),
    var_day = var(score),
    avg_score_day = mean(score),
    # Must be computed before `score` is overwritten by its daily sum below.
    score_sq = sum(score^2),
    score = sum(score)
  ) %>%
  ungroup() %>%
  full_join(select(df2, name = name1, day = day1), by = c("name", "day")) %>%
  arrange(name, day) %>%
  group_by(name) %>%
  mutate(across(
    c(volume_day, score, score_sq, avg_score_day, var_day),
    ~ if_else(is.na(.x), 0, .x)
  )) %>%
  mutate(
    volume_day_cum = cumsum(volume_day),
    avg_score_cum = if_else(cumsum(score) == 0, 0, cumsum(score) / volume_day_cum),
    # Running sample variance of all scores seen so far:
    #   (sum(x^2) - sum(x)^2 / N) / (N - 1)
    # pmax() guards against tiny negative values from floating-point
    # cancellation. The previous centred rolling window over daily sums was
    # not a cumulative variance.
    var_cum = if_else(
      volume_day_cum > 1,
      pmax(
        (cumsum(score_sq) - cumsum(score)^2 / volume_day_cum) /
          (volume_day_cum - 1),
        0
      ),
      0
    )
  ) %>%
  # Drop the helper column so the result keeps its original nine columns.
  select(-score_sq) %>%
  print(n = 99)
# # A tibble: 22 x 9
# # Groups: name [2]
# name day volume_day var_day avg_score_day score volume_day_cum avg_score_cum var_cum
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 apple 0 0 0 0 0 0 0 0
# 2 apple 1 1 0 7 7 1 7 0
# 3 apple 2 0 0 0 0 1 7 0
# 4 apple 3 0 0 0 0 1 7 0
# 5 apple 4 0 0 0 0 1 7 0
# 6 apple 5 0 0 0 0 1 7 0
# 7 apple 6 0 0 0 0 1 7 0
# 8 apple 7 0 0 0 0 1 7 0
# 9 apple 8 1 0 7 7 2 7 0
# 10 apple 9 1 0 8 8 3 7.33 0.333
# 11 apple 10 0 0 0 0 3 7.33 0.333
# 12 orange 0 1 0 1 1 1 1 0
# 13 orange 1 0 0 0 0 1 1 0
# 14 orange 2 3 4.33 5.67 17 4 4.5 8.33
# 15 orange 3 0 0 0 0 4 4.5 8.33
# 16 orange 4 0 0 0 0 4 4.5 8.33
# 17 orange 5 0 0 0 0 4 4.5 8.33
# 18 orange 6 0 0 0 0 4 4.5 8.33
# 19 orange 7 1 0 4 4 5 4.4 6.3
# 20 orange 8 0 0 0 0 5 4.4 6.3
# 21 orange 9 0 0 0 0 5 4.4 6.3
# 22 orange 10 0 0 0 0 5 4.4 6.3

Resources