Calculate and add percent columns to multiple columns inside a function - r

I have a large survey data-set to summarise, I have calculated row counts across multiple columns grouped by treatment and control conditions.
I need to add columns that calculate the percentage for each group and a 'difference' column
(percentage gp1) - (percentage gp2) but can't work out how to do it. Help would be appreciated.
Below is an example of the data:
library(tidyverse)
library(janitor)
df <- data.frame(mhst = factor(c(0,1,1,0)),
q1 = factor(c(1, 4, 2, 2)),
q2 = factor(c(3, 4, 5, 1)),
q3 = factor(c(1, 4, 2, 5)),
q4 = factor(c(2, 1, 1, 3)),
WT1 = c(0.5, 0.3, 6, 1))
q_set_t1 <- c("q1", "q2", "q3", "q4") #choose cols to calc
wt1 <- c("WT1") # choose weights
make_output <- function(mycol, weight) { ## Make the output table for the chosen column and weight
output <- df %>% group_by_at(c("mhst", mycol)) %>%
summarise_at(weight, sum)
names(output)[2] <- "Q_set" #Set the column names
names(output)[3] <- "weighted_count"
output <- output %>% pivot_wider(names_from = mhst, values_from = weighted_count) #sets MHST nonMHST side by side
output <- output %>% janitor::adorn_totals()
output <- output %>% mutate_all(~as.character(.)) #converts to character for easier manipulation when joining vertically
output <- bind_rows(tibble("var_name"=mycol), #adds a blank row above the output table with the var name
output, #adds the output table
tibble("var_name"="")) #adds blank row at the bottom of each tibble
}
cols_output <- pmap(list(q_set_t1, wt1), ~make_output(..1,..2)) # Generates all the output tables by coercing into a lsit and feeding through the make_output function feeding
q_set_wt <- tibble() # Join them vertically
for(i in 1:length(cols_output)) {
q_set_wt <- bind_rows(q_set_wt, cols_output[[i]])
}
I have tried adding this code:
output <- output %>% mutate(percent = mycol/sum(mycol)*100)
below in the row under the pivot_wider() function but I get an error:
"Caused by error in `sum()`:
! invalid 'type' (character) of argument"
For the difference column I have tried adding
output <- output %>% mutate("0" - "1") (0 and 1 being the names of the mhst levels)
below in the row under the pivot_wider() function but I get error:
! Problem while computing `..1 = "0" - "1"`.
Caused by error in `"0" - "1"`:
! non-numeric argument to binary operator

We may modify the function as
library(purrr)
library(dplyr)
library(tidyr)
make_output <- function(mycol, weight) {
df %>%
group_by(across(c("mhst", all_of(mycol)))) %>%
summarise(across(all_of(weight), sum), .groups = "drop") %>%
rename(Q_set = 2, weighted_count = 3) %>%
group_by(Q_set) %>%
mutate(Perc = weighted_count/sum(weighted_count) * 100) %>%
ungroup %>%
pivot_wider(names_from = mhst,
values_from = weighted_count, values_fill = 0) %>%
mutate(Diff = `0` - `1`)
}
and then apply pmap
> pmap_dfr(list(q_set_t1, wt1), ~make_output(..1, ..2))
# A tibble: 15 × 5
Q_set Perc `0` `1` Diff
<fct> <dbl> <dbl> <dbl> <dbl>
1 1 100 0.5 0 0.5
2 2 14.3 1 0 1
3 2 85.7 0 6 -6
4 4 100 0 0.3 -0.3
5 1 100 1 0 1
6 3 100 0.5 0 0.5
7 4 100 0 0.3 -0.3
8 5 100 0 6 -6
9 1 100 0.5 0 0.5
10 5 100 1 0 1
11 2 100 0 6 -6
12 4 100 0 0.3 -0.3
13 2 100 0.5 0 0.5
14 3 100 1 0 1
15 1 100 0 6.3 -6.3

Related

Alternative method to count number of single occurencies across columns of interest

I would like the number of single occurrences of some rows values across different columns. I have applied the following code:
dat = data.frame()
vector <- c(1, 2, 3)
for (i in names(data)){
for (j in vector){
dat[j,i] <- length(which(data[,i] == j))
}
}
print(dat)
That return exactly the output I am looking for. Does this code contain any redundancies? Could you please some more effective alternative way with the iterative method (including for loop) or with dplyr() packages?
Thanks
Here is a short extract of the dataset I am working on.
structure(list(run_set_1 = c(3, 3, 3, 3, 3, 3), run_set_2 = c(1,
1, 1, 1, 1, 1), run_set_3 = c(2, 2, 2, 2, 2, 2)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
You could first match() each column to get the index in vector that
the column values correspond to, if any. Then tabulate() those to get the
counts, including 0s:
lapply(data, match, vector) |>
sapply(tabulate, length(vector))
#> run_set_1 run_set_2 run_set_3
#> [1,] 0 6 0
#> [2,] 0 0 6
#> [3,] 6 0 0
This can be modified to use dplyr-native iteration:
library(dplyr, warn.conflicts = FALSE)
data %>%
summarise(
across(everything(), match, vector) %>%
purrr::map_dfc(tabulate, length(vector))
)
#> # A tibble: 3 × 3
#> run_set_1 run_set_2 run_set_3
#> <int> <int> <int>
#> 1 0 6 0
#> 2 0 0 6
#> 3 6 0 0
EDIT : I added the case for a value that we expect but is missing (4 as example)
Here is the tidyverse version. I think it may be even shorter but I don't know yet.
vector = c(1:4)
library(dplyr)
library(tidyr)
data %>% pivot_longer(cols = everything()) %>%
mutate(value = factor(as.character(value), levels = vector)) %>%
count(name, value, .drop = FALSE) %>%
pivot_wider(names_from = name, values_from = n) %>%
arrange(value) %>% select(-value)
# last line only to remove the value column and fit your example
# # A tibble: 3 × 3
# run_set_1 run_set_2 run_set_3
# <int> <int> <int>
# 1 0 6 0
# 2 0 0 6
# 3 6 0 0
# 4 0 0 0

Count cumulative and sequential values of the same sign in R

I'm looking for the equivalent code in R to this post in python for adding a column that cumulative counts the number of positives and negative values in the preceeding column.
I've found many examples of cumulative sums or something more complex, but I would just like to count the number of positives and negatives in a row that resets whenever the sign changes. See sample code.
library(dplyr)
df <- data.frame(x = c(0.5, 1, 6.5, -2, 3, -0.2, -1))
My expected output is this:
df <- data.frame(x = c(0.5, 1, 6.5, -2, 3, -0.2, -1),
z = c(1,2,3,-1,1,-1,-2))
I would like R to create column "z" with a mutate function to the dataframe df when it starts with just "x".
You can try:
library(dplyr)
df %>%
mutate(z = with(rle(sign(x)), sequence(lengths) * rep(values, lengths)))
x z
1 0.5 1
2 1.0 2
3 6.5 3
4 -2.0 -1
5 3.0 1
6 -0.2 -1
7 -1.0 -2
You may want to consider how zeroes should be treated as the above may need a modification if zeroes exist in your vector. Perhaps:
df %>%
mutate(z = with(rle(sign(x)), sequence(lengths) * rep(values^(values != 0), lengths)))
Edit addressing OP comment below:
df %>%
mutate(z = with(tmp <- rle(sign(x)), sequence(lengths) * rep(values, lengths)),
id = with(tmp, rep(seq_along(lengths), lengths))) %>%
group_by(id) %>%
mutate(avg = cumsum(x)/row_number()) %>%
ungroup() %>%
select(-id)
# A tibble: 7 x 3
x z avg
<dbl> <dbl> <dbl>
1 0.5 1 0.5
2 1 2 0.75
3 6.5 3 2.67
4 -2 -1 -2
5 3 1 3
6 -0.2 -1 -0.2
7 -1 -2 -0.6

Replacing all values in a column after a value < 1 with zeros in R

Suppose I have a time-series of number of cases generated by a transmission model, with each unique model labelled separately
trial.df <- data.frame(
model = rep(1:3, each =5),
time = rep(1:5, 3),
cases= rnorm(15, mean = 1, sd = 0.2)
)
In the real world, once you get to < 1 case, the disease should go extinct, but due to the way the model code works, you can still have cases > 1 even after they drop below 1. How can I force all values that come after a value < 1 to be zero?
I tried filtering the first time point that a zero appears for each model and then replacing all values that appear after that time point with zero, but it doesn't work with my current code
trial.df %>%
group_by(model) %>%
filter(floor(cases) == 0) %>%
slice(1) %>%
ungroup() -> rows
for (i in nrow(rows)){
trial.df %>%
group_by(model) %>%
mutate(cases2= case_when(
model == rows$model[i] & time >= rows$time[i] ~ 0,
TRUE ~ cases
)) %>% print(n = Inf)
}
I also tried using replace() to do this, but this doesn't consider the new replaced values, only those in the original data
trial.df %>%
group_by(model) %>%
mutate(cases2 = replace(cases, floor(lag(cases)) == 0, 0))
I get the feeling this should be relatively straightforward, but can't seem to get it to work. Would really appreciate the advice!
Thanks
You can use dplyr::cumany:
set.seed(42)
trial.df <- data.frame(
model = rep(1:3, each =5),
time = rep(1:5, 3),
cases= rnorm(15, mean = 1, sd = 0.2)
)
trial.df %>%
group_by(model) %>%
mutate(cases2 = if_else(cumany(cases < 1), 0, cases)) %>%
ungroup()
# # A tibble: 15 x 4
# model time cases cases2
# <int> <int> <dbl> <dbl>
# 1 1 1 1.27 1.27
# 2 1 2 0.887 0
# 3 1 3 1.07 0
# 4 1 4 1.13 0
# 5 1 5 1.08 0
# 6 2 1 0.979 0
# 7 2 2 1.30 0
# 8 2 3 0.981 0
# 9 2 4 1.40 0
# 10 2 5 0.987 0
# 11 3 1 1.26 1.26
# 12 3 2 1.46 1.46
# 13 3 3 0.722 0
# 14 3 4 0.944 0
# 15 3 5 0.973 0
Thanks for that response Henrik, it worked brilliantly! Don't know why I can't see your answer now, so sharing it so others can learn from it.
Using the cummin() function
x = c(2, 1, 0.5, 0, 1); x * cummin(x >= 1)
trial.df %>%
group_by(model) %>%
mutate(cases2= cases*cummin(cases>=1)) %>% print(n = Inf)
I was sure there was a simple solution to it. I should really understand the use of the cum..() functions more! Thanks again!

Interpolation of values from list

I have a dataframe containing the results of a competition. In this example competitors b and c have tied for second place. The actual dataframe is very large and could contain multiple ties.
df <- data.frame(name = letters[1:4],
place = c(1, 2, 2, 4))
I also have point values for the respective places, where first place gets 4 points, 2nd gets 3, 3rd gets 1 and 4th gets 0.
points <- c(4, 3, 1, 0)
names(points) <- 1:4
I can match points to place to get each competitor's score
df %>%
mutate(score = points[place])
name place score
1 a 1 4
2 b 2 3
3 c 2 3
4 d 4 0
What I would like to do though is award points to b and c that are the mean of the point values for 2nd and 3rd, such that each receives 2 points like this:
name place score
1 a 1 4
2 b 2 2
3 c 2 2
4 d 4 0
How can I accomplish this programmatically?
A solution using nested data frames and purrr.
library(dplyr)
library(tidyr)
library(purrr)
df <- data.frame(name = letters[1:4],
place = c(1, 2, 2, 4))
points <- c(4, 3, 1, 0)
names(points) <- 1:4
# a function to help expand the dataframe based on the number of ties
expand_all <- function(x,n){
x:(x+n-1)
}
df %>%
group_by(place) %>%
tally() %>%
mutate(new_place = purrr::map2(place,n, expand_all)) %>%
unnest(new_place) %>%
mutate(score = points[new_place]) %>%
group_by(place) %>%
summarize(score = mean(score)) %>%
inner_join(df)
Robert Wilson's answer gave me an idea. Rather than mapping over nested dataframes the rank function from base can get to the same result
df %>%
mutate(new_place = rank(place, ties.method = "first")) %>%
mutate(score = points[new_place]) %>%
group_by(place) %>%
summarize(score = mean(score)) %>%
inner_join(df)
place score name
<dbl> <dbl> <chr>
1 1 4 a
2 2 2 b
3 2 2 c
4 4 0 d
This can be accomplished in few lines with an ifelse() statement inside of a mutate():
df %>%
group_by(place) %>%
mutate(n_ties = n()) %>%
ungroup %>%
mutate(score = (points[place] + ifelse(n_ties > 1, 1, 0))/ n_ties)
# A tibble: 4 x 4
name place n_ties score
<chr> <dbl> <int> <dbl>
1 a 1 1 4
2 b 2 2 2
3 c 2 2 2
4 d 4 1 0

How can I create a column that cumulatively adds the sum of two previous rows based on conditions?

I tried asking this question before but was it was poorly stated. This is a new attempt cause I haven't solved it yet.
I have a dataset with winners, losers, date, winner_points and loser_points.
For each row, I want two new columns, one for the winner and one for the loser that shows how many points they have scored so far (as both winners and losers).
Example data:
winner <- c(1,2,3,1,2,3,1,2,3)
loser <- c(3,1,1,2,1,1,3,1,2)
date <- c("2017-10-01","2017-10-02","2017-10-03","2017-10-04","2017-10-05","2017-10-06","2017-10-07","2017-10-08","2017-10-09")
winner_points <- c(2,1,2,1,2,1,2,1,2)
loser_points <- c(1,0,1,0,1,0,1,0,1)
test_data <- data.frame(winner, loser, date = as.Date(date), winner_points, loser_points)
I want the output to be:
winner_points_sum <- c(0, 0, 1, 3, 1, 3, 5, 3, 5)
loser_points_sum <- c(0, 2, 2, 1, 4, 5, 4, 7, 4)
test_data <- data.frame(winner, loser, date = as.Date(date), winner_points, loser_points, winner_points_sum, loser_points_sum)
How I've solved it thus far is to do a for loop such as:
library(dplyr)
test_data$winner_points_sum_loop <- 0
test_data$loser_points_sum_loop <- 0
for(i in row.names(test_data)) {
test_data[i,]$winner_points_sum_loop <-
(
test_data %>%
dplyr::filter(winner == test_data[i,]$winner & date < test_data[i,]$date) %>%
dplyr::summarise(points = sum(winner_points, na.rm = TRUE))
+
test_data %>%
dplyr::filter(loser == test_data[i,]$winner & date < test_data[i,]$date) %>%
dplyr::summarise(points = sum(loser_points, na.rm = TRUE))
)
}
test_data$winner_points_sum_loop <- unlist(test_data$winner_points_sum_loop)
Any suggestions how to tackle this problem? The queries take quite some time when the row numbers add up. I've tried elaborating with the AVE function, I can do it for one column to sum a players point as winner but can't figure out how to add their points as loser.
winner <- c(1,2,3,1,2,3,1,2,3)
loser <- c(3,1,1,2,1,1,3,1,2)
date <- c("2017-10-01","2017-10-02","2017-10-03","2017-10-04","2017-10-05","2017-10-06","2017-10-07","2017-10-08","2017-10-09")
winner_points <- c(2,1,2,1,2,1,2,1,2)
loser_points <- c(1,0,1,0,1,0,1,0,1)
test_data <- data.frame(winner, loser, date = as.Date(date), winner_points, loser_points)
library(dplyr)
library(tidyr)
test_data %>%
unite(winner, winner, winner_points) %>% # unite winner columns
unite(loser, loser, loser_points) %>% # unite loser columns
gather(type, pl_pts, winner, loser, -date) %>% # reshape
separate(pl_pts, c("player","points"), convert = T) %>% # separate columns
arrange(date) %>% # order dates (in case it's not)
group_by(player) %>% # for each player
mutate(sum_points = cumsum(points) - points) %>% # get points up to that date
ungroup() %>% # forget the grouping
unite(pl_pts_sumpts, player, points, sum_points) %>% # unite columns
spread(type, pl_pts_sumpts) %>% # reshape
separate(loser, c("loser", "loser_points", "loser_points_sum"), convert = T) %>% # separate columns and give appropriate names
separate(winner, c("winner", "winner_points", "winner_points_sum"), convert = T) %>%
select(winner, loser, date, winner_points, loser_points, winner_points_sum, loser_points_sum) # select the order you prefer
# # A tibble: 9 x 7
# winner loser date winner_points loser_points winner_points_sum loser_points_sum
# * <int> <int> <date> <int> <int> <int> <int>
# 1 1 3 2017-10-01 2 1 0 0
# 2 2 1 2017-10-02 1 0 0 2
# 3 3 1 2017-10-03 2 1 1 2
# 4 1 2 2017-10-04 1 0 3 1
# 5 2 1 2017-10-05 2 1 1 4
# 6 3 1 2017-10-06 1 0 3 5
# 7 1 3 2017-10-07 2 1 5 4
# 8 2 1 2017-10-08 1 0 3 7
# 9 3 2 2017-10-09 2 1 5 4
I finally understood what you want. And I took an approach of getting cumulative points of each player at each point in time and then joining it to the original test_data data frame.
winner <- c(1,2,3,1,2,3,1,2,3)
loser <- c(3,1,1,2,1,1,3,1,2)
date <- c("2017-10-01","2017-10-02","2017-10-03","2017-10-04","2017-10-05","2017-10-06","2017-10-07","2017-10-08","2017-10-09")
winner_points <- c(2,1,2,1,2,1,2,1,2)
loser_points <- c(1,0,1,0,1,0,1,0,1)
test_data <- data.frame(winner, loser, date = as.Date(date), winner_points, loser_points)
library(dplyr)
library(tidyr)
cum_points <- test_data %>%
gather(end_game_status, player_id, winner, loser) %>%
gather(which_point, how_many_points, winner_points, loser_points) %>%
filter(
(end_game_status == "winner" & which_point == "winner_points") |
(end_game_status == "loser" & which_point == "loser_points")) %>%
arrange(date = as.Date(date)) %>%
group_by(player_id) %>%
mutate(cumulative_points = cumsum(how_many_points)) %>%
mutate(cumulative_points_sofar = lag(cumulative_points, default = 0))
select(player_id, date, cumulative_points)
output <- test_data %>%
left_join(cum_points, by = c('date', 'winner' = 'player_id')) %>%
rename(winner_points_sum = cumulative_points_sofar) %>%
left_join(cum_points, by = c('date', 'loser' = 'player_id')) %>%
rename(loser_points_sum = cumulative_points_sofar)
output
The difference to the previous question of the OP is that the OP is now asking for the cumulative sum of points each player has scored so far, i.e., before the actual date. Furthermore, the sample data set now contains a date column which uniquely identifies each row.
So, my previous approach can be used here as well, with some modifications. The solution below reshapes the data from wide to long format whereby two value variables are reshaped simultaneously, computes the cumulative sums for each player id , and finally reshapes from long back to wide format, again. In order to sum only points scored before the actual date, the rows are lagged by one.
It is important to note that the winner and loser columns contain the respective player ids.
library(data.table)
cols <- c("winner", "loser")
setDT(test_data)[
# reshape multiple value variables simultaneously from wide to long format
, melt(.SD, id.vars = "date",
measure.vars = list(cols, paste0(cols, "_points")),
value.name = c("id", "points"))][
# rename variable column
, variable := forcats::lvls_revalue(variable, cols)][
# order by date and cumulate the lagged points by id
order(date), points_sum := cumsum(shift(points, fill = 0)), by = id][
# reshape multiple value variables simultaneously from long to wide format
, dcast(.SD, date ~ variable, value.var = c("id", "points", "points_sum"))]
date id_winner id_loser points_winner points_loser points_sum_winner points_sum_loser
1: 2017-10-01 1 3 2 1 0 0
2: 2017-10-02 2 1 1 0 0 2
3: 2017-10-03 3 1 2 1 1 2
4: 2017-10-04 1 2 1 0 3 1
5: 2017-10-05 2 1 2 1 1 4
6: 2017-10-06 3 1 1 0 3 5
7: 2017-10-07 1 3 2 1 5 4
8: 2017-10-08 2 1 1 0 3 7
9: 2017-10-09 3 2 2 1 5 4

Resources