Summarizing by two grouping variables - r

I have a data frame that I want to group by two variables and then summarise to get the total and the average.
I tried this on my data, and it gives the correct result:
df %>%
  group_by(date, group) %>%
  summarise(
    weight = sum(ind_weigh),
    total_usage = sum(total_usage_min),
    Avg_usage = total_usage / weight) %>%
  ungroup()
It returns this data frame:
df <- tibble::tribble(
~date, ~group, ~weight, ~total_usage, ~Avg_usage,
20190201, 0, 450762, 67184943, 149,
20190201, 1, 2788303, 385115718, 138,
20190202, 0, 483959, 60677765, 125,
20190202, 1, 2413699, 311226351, 129,
20190203, 0, 471189, 59921762, 127,
20190203, 1, 2143811, 277425186, 129,
20190204, 0, 531020, 83695977, 158,
20190204, 1, 2640087, 403200829, 153
)
I am wondering how I can add another variable to my script to get Avg_usage_total (across both group 0 and group 1) as well.
Expected result:
e.g., first row --> 67184943 / (450762 + 2788303) = 20.7
date group rech total_usage Avg_usage Avg_usage_total
20190201 0 450762 67184943 149 20.7
20190201 1 2788303 385115718 138 118.9

You can do that using mutate(), with a group_by(date) beforehand if you want Avg_usage_total computed per date.
library(tidyverse)
# generate dataset
(df <- tibble(
  date = c(rep(Sys.Date(), 10), rep(Sys.Date() - 1, 10)),
  group = rbinom(20, 1, 0.5),
  rech = runif(20),
  weight = runif(20),
  total_usage = runif(20)
))
# A tibble: 20 x 5
date group rech weight total_usage
<date> <int> <dbl> <dbl> <dbl>
1 2019-03-10 0 0.985 0.831 0.963
2 2019-03-10 1 0.178 0.990 0.676
3 2019-03-10 1 0.505 0.697 0.152
4 2019-03-10 1 0.416 0.165 0.824
5 2019-03-10 0 0.554 0.790 0.974
# step 1 of analysis
(df <- df %>%
  group_by(date, group) %>%
  summarise(rech = sum(rech),
            weight = sum(weight),
            total_usage = sum(total_usage)) %>%
  mutate(Avg_usage = total_usage / weight))
# A tibble: 4 x 6
# Groups: date [2]
date group rech weight total_usage Avg_usage
<date> <int> <dbl> <dbl> <dbl> <dbl>
1 2019-03-09 0 3.29 4.82 3.03 0.628
2 2019-03-09 1 1.45 1.22 1.16 0.954
3 2019-03-10 0 1.54 1.62 1.94 1.20
4 2019-03-10 1 3.15 4.55 4.63 1.02
# step 2 of analysis
df %>%
  group_by(date) %>% # only necessary if you want Avg_usage_total computed per date
  mutate(Avg_usage_total = total_usage / sum(rech)) %>% # total_usage is the row's value; sum(rech) is the total over the group
  ungroup()
# A tibble: 4 x 7
date group rech weight total_usage Avg_usage Avg_usage_total
<date> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2019-03-09 0 3.29 4.82 3.03 0.628 0.639
2 2019-03-09 1 1.45 1.22 1.16 0.954 0.246
3 2019-03-10 0 1.54 1.62 1.94 1.20 0.413
4 2019-03-10 1 3.15 4.55 4.63 1.02 0.986
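Applied directly to the aggregated tibble from the question, a minimal sketch of step 2 (using the question's own column names; Avg_usage_total divides each group's total_usage by the combined weight of both groups on the same date) would be:
df %>%
  group_by(date) %>%
  mutate(Avg_usage_total = total_usage / sum(weight)) %>%
  ungroup()
# e.g. first row: 67184943 / (450762 + 2788303) = 20.7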

Related

In R, there are `actual` and `budget` values; how to add a new variable and calculate its values

In the variable type there are actual and budget values; how can I add a new variable and calculate its value? My current code works, but it is a little tedious. Can anyone help? Thanks!
ori_data <- data.frame(
  category = c("A", "A", "A", "B", "B", "B"),
  year = c(2021, 2022, 2022, 2021, 2022, 2022),
  type = c("actual", "actual", "budget", "actual", "actual", "budget"),
  sales = c(100, 120, 130, 70, 80, 90),
  profit = c(3.7, 5.52, 5.33, 2.73, 3.92, 3.69)
)
Add sales inc%:
ori_data$sales_inc_or_budget_acheved <- NA
ori_data$sales_inc_or_budget_acheved[ori_data$category == 'A' & ori_data$year == 2022 & ori_data$type == 'actual'] <-
  ori_data$sales[ori_data$category == 'A' & ori_data$year == 2022 & ori_data$type == 'actual'] /
  ori_data$sales[ori_data$category == 'A' & ori_data$year == 2021 & ori_data$type == 'actual'] - 1
Add budget achieved%:
ori_data$sales_inc_or_budget_acheved[ori_data$category == 'A' & ori_data$year == 2022 & ori_data$type == 'budget'] <-
  ori_data$sales[ori_data$category == 'A' & ori_data$year == 2022 & ori_data$type == 'actual'] /
  ori_data$sales[ori_data$category == 'A' & ori_data$year == 2022 & ori_data$type == 'budget']
Using a group_by() and an if_else() you could do the following; the arrange() makes sure that, within each category, the rows are ordered so that lag() picks up the right comparison row:
library(dplyr)
ori_data |>
  group_by(category) |>
  arrange(category, type, year) |>
  mutate(sales_inc_or_budget_achieved = if_else(type == "actual",
                                                sales / lag(sales) - 1,
                                                lag(sales) / sales)) |>
  ungroup()
#> # A tibble: 6 × 6
#> category year type sales profit sales_inc_or_budget_achieved
#> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 A 2021 actual 100 3.7 NA
#> 2 A 2022 actual 120 5.52 0.2
#> 3 A 2022 budget 130 5.33 0.923
#> 4 B 2021 actual 70 2.73 NA
#> 5 B 2022 actual 80 3.92 0.143
#> 6 B 2022 budget 90 3.69 0.889
And using across you could do the same for both sales and profit:
ori_data |>
  group_by(category) |>
  arrange(category, type, year) |>
  mutate(across(c(sales, profit), ~ if_else(type == "actual",
                                            .x / lag(.x) - 1,
                                            lag(.x) / .x),
                .names = "{.col}_inc_or_budget_achieved")) |>
  ungroup()
#> # A tibble: 6 × 7
#> category year type sales profit sales_inc_or_budget_achie… profit_inc_or_b…
#> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 A 2021 actual 100 3.7 NA NA
#> 2 A 2022 actual 120 5.52 0.2 0.492
#> 3 A 2022 budget 130 5.33 0.923 1.04
#> 4 B 2021 actual 70 2.73 NA NA
#> 5 B 2022 actual 80 3.92 0.143 0.436
#> 6 B 2022 budget 90 3.69 0.889 1.06
stefan's answer works perfectly well; however, I would suggest you rearrange your data first.
In my opinion, sales and profit are the kinds of measures (i.e., the observations) here, while actual and budget are the measurements:
library(tidyr)
library(dplyr)
ori_data2 <-
  ori_data %>%
  pivot_longer(c(sales, profit)) %>%
  pivot_wider(names_from = type, values_from = value) %>%
  group_by(category, name) %>%
  arrange(year, .by_group = TRUE)
Then your calculations become much easier:
ori_data2 %>%
  mutate(increase = actual / lag(actual) - 1,      # compare to the year before
         budget_achieved = actual / budget) %>%    # compare actual vs. budget
  filter(year == 2022) %>%                         # you can filter for the year of interest
  mutate(across(c(increase, budget_achieved), scales::percent)) # and format as percent

Choose dataframe variables by name and multiply with a vector elementwise

I have a data frame and a vector as follows:
my_df <- data.frame(
  year  = c(2001, 2001, 2001, 2001, 2001, 2001),
  month = c(1, 2, 3, 4, 5, 6),
  Pdt_d0 = c(0.379045935402736, 0.377328817455841, 0.341158889847019,
             0.36761990427443, 0.372442657083218, 0.382702189949558),
  Pdt_d1 = c(0.146034519173855, 0.166289573095497, 0.197787188740911,
             0.137071647982617, 0.162103042313547, 0.168566518193772),
  Pdt_d2 = c(0.126975939811326, 0.107708783271871, 0.14096203677089,
             0.142228236885706, 0.115542396064519, 0.106935751726809),
  Pdt_tot = c(2846715, 2897849.5, 2935406.25, 2850649, 2840313.75, 3087993.5)
)
my_vec <- 1:3
I want to multiply Pdt_d0:Pdt_d2 by the corresponding elements of my_vec, while keeping the other columns untouched. I can get the desired multiplication with dplyr::select(my_df, num_range("Pdt_d", 0:2)) %>% mapply(`*`, ., my_vec), but I lose the year, month and Pdt_tot columns in the process. I tried to achieve my goal with dplyr::select(my_df, num_range("Pdt_d", 0:2)) <- dplyr::select(my_df, num_range("Pdt_d", 0:2)) %>% mapply(`*`, ., my_vec), which returns the error 'select<-' is not an exported object. Is there an obvious trick I am not seeing?
I don't think my question is a duplicate; I have seen the answers here and here, but neither question lets me choose variables by name.
You can use the same left-hand-side-overwritten-by-right-hand-side Map()/mapply() logic that you tried, just outside of the tidy world:
vars <- paste0("Pdt_d", 0:2)
my_df[vars] <- Map(`*`, my_df[vars], my_vec)
my_df
# year month Pdt_d0 Pdt_d1 Pdt_d2 Pdt_tot
#1 2001 1 0.3790459 0.2920690 0.3809278 2846715
#2 2001 2 0.3773288 0.3325791 0.3231263 2897850
#3 2001 3 0.3411589 0.3955744 0.4228861 2935406
#4 2001 4 0.3676199 0.2741433 0.4266847 2850649
#5 2001 5 0.3724427 0.3242061 0.3466272 2840314
#6 2001 6 0.3827022 0.3371330 0.3208073 3087994
This works because [<- exists as a function in R for assigning to a left-hand-side selection made with square brackets, like my_df[].
The error you got was returned because the code had a select() call on the left-hand side, and there is no 'select<-' function; you can't assign to a select()-ion because it isn't set up to work like that. The tidy functions are usually meant to be piped, like my_df %>% select() %>% etc., without overwriting the original input.
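If you would rather stay inside the tidyverse, a hedged sketch using mutate() with across() and cur_column() (dplyr 1.0 or later) does the same in-place multiplication; the match() call that pairs each column with its element of my_vec is my own assumption about the intended mapping:
library(dplyr)

vars <- paste0("Pdt_d", 0:2)
my_df %>%
  mutate(across(all_of(vars), ~ .x * my_vec[match(cur_column(), vars)]))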
I don't think that you want to do this mess, but it does work.
library(dplyr)
library(tidyr)
my_df %>%
  gather(variable, value, -year, -month, -Pdt_tot) %>%
  group_by(year, month, Pdt_tot) %>%
  mutate(value = value * my_vec) %>%
  spread(variable, value)
year month Pdt_tot Pdt_d0 Pdt_d1 Pdt_d2
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2001 1 2846715 0.379 0.292 0.381
2 2001 2 2897850. 0.377 0.333 0.323
3 2001 3 2935406. 0.341 0.396 0.423
4 2001 4 2850649 0.368 0.274 0.427
5 2001 5 2840314. 0.372 0.324 0.347
6 2001 6 3087994. 0.383 0.337 0.321
Without specifying year, month, and Pdt_tot explicitly:
my_df %>%
  gather(variable, value, -!num_range("Pdt_d", 0:2)) %>%
  group_by(across(c(-variable, -value))) %>%
  mutate(value = value * my_vec) %>%
  spread(variable, value)
year month Pdt_tot Pdt_d0 Pdt_d1 Pdt_d2
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2001 1 2846715 0.379 0.292 0.381
2 2001 2 2897850. 0.377 0.333 0.323
3 2001 3 2935406. 0.341 0.396 0.423
4 2001 4 2850649 0.368 0.274 0.427
5 2001 5 2840314. 0.372 0.324 0.347
6 2001 6 3087994. 0.383 0.337 0.321
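For what it is worth, the same idea can be written with the newer pivot_longer()/pivot_wider() verbs; this is only a sketch and relies on the same assumption that the three Pdt_d columns line up with my_vec within each group:
my_df %>%
  pivot_longer(num_range("Pdt_d", 0:2)) %>%
  group_by(year, month, Pdt_tot) %>%
  mutate(value = value * my_vec) %>%
  ungroup() %>%
  pivot_wider(names_from = name, values_from = value)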

reshaping rows of data to two columns

We have data on school districts where the columns are the local-specific information (e.g., free and reduced price lunch %) and the corresponding statewide values.
dat <- tribble(
~state.poverty, ~state.EL, ~state.disability, ~state.frpl, ~local.poverty, ~local.frpl, ~local.disability, ~local.EL,
12.50592, 0.08342419, 0.12321831, 0.4495395, 25.23731, 0.6415712, 0.140739, 0.1469898)
dat
# A tibble: 1 x 8
state.poverty state.EL state.disability state.frpl local.poverty local.frpl local.disability local.EL
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 12.5 0.0834 0.123 0.450 25.2 0.642 0.141 0.147
We want to reshape that so that it looks like this.
demog state local
<chr> <dbl> <dbl>
1 poverty 12.5 25.2
2 EL 0.0834 0.147
3 disability 0.123 0.141
4 frpl 0.450 0.642
It seems like something that pivot_longer should be able to handle, but I haven't had much success so far. Any suggestions?
We can use pivot_longer
library(dplyr)
library(tidyr)
dat %>%
  pivot_longer(cols = everything(),
               names_to = c(".value", "demog"), names_sep = "\\.")
Output:
# A tibble: 4 x 3
# demog state local
# <chr> <dbl> <dbl>
#1 poverty 12.5 25.2
#2 EL 0.0834 0.147
#3 disability 0.123 0.141
#4 frpl 0.450 0.642
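To go back the other way, a hedged sketch with pivot_wider() and a names_glue specification can rebuild the original dotted column names (the column order may differ from the input):
dat %>%
  pivot_longer(cols = everything(),
               names_to = c(".value", "demog"), names_sep = "\\.") %>%
  pivot_wider(names_from = demog, values_from = c(state, local),
              names_glue = "{.value}.{demog}")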
A base R option using reshape
reshape(
  dat,
  direction = "long",
  varying = 1:ncol(dat)
)
gives
# A tibble: 4 x 4
time state local id
<chr> <dbl> <dbl> <int>
1 poverty 12.5 25.2 1
2 EL 0.0834 0.642 1
3 disability 0.123 0.141 1
4 frpl 0.450 0.147 1

Compare one group to the rest of the groups as a whole in R

Here is some sample data:
movie_df <- data.frame(
  "ID" = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  "movie_type" = c("Action", "Horror", "Comedy", "Thriller", "Comedy",
                   "Action", "Thriller", "Horror", "Action", "Comedy"),
  "snack_type" = c("Chocolate", "Popcorn", "Candy", "Popcorn", "Popcorn",
                   "Candy", "Chocolate", "Candy", "Popcorn", "Chocolate"),
  "event_type" = c("Solo", "Family", "Date", "Friends", "Solo",
                   "Family", "Date", "Date", "Friends", "Friends"),
  "total_cost" = c(50, 35, 20, 50, 30,
                   60, 25, 35, 20, 50)
)
What I want to do is go through each column and compare each group to the rest of the groups on total_cost. For example, I want to see how movie_type == 'Action' compares to movie_type != 'Action' for total_cost. I want to do that for every type in movie_type then every type in snack_type and event_type.
What I ultimately want to get to is the following, where sd = standard deviation. Ideally this would be done with a tidyverse method in R (e.g., dplyr or tidyr):
> results_df
# A tibble: 11 x 11
Group Grp_1 Grp_2 Grp_1_mean Grp_2_mean Grp_1_sd Grp_2_sd Grp_1_n Grp_2_n Mean_Diff `t-test`
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 movie_type Action Rest of group 43.3 35 20.8 11.5 3 7 8.33 2.84
2 movie_type Horror Rest of group 35 38.1 0 16.0 2 8 -3.12 -2.21
3 movie_type Thriller Rest of group 37.5 37.5 17.7 14.6 2 8 0 0
4 movie_type Comedy Rest of group 33.3 39.3 15.3 14.6 3 7 -5.95 -2.22
5 snack_type Chocolate Rest of group 41.7 35.7 14.4 14.8 3 7 5.95 2.26
6 snack_type Candy Rest of group 38.3 37.1 20.2 12.9 3 7 1.19 0.407
7 snack_type Popcorn Rest of group 33.8 40 12.5 15.8 4 6 -6.25 -2.60
8 event_type Date Rest of group 26.7 42.1 7.64 14.1 3 7 -15.5 -7.25
9 event_type Family Rest of group 47.5 35 17.7 13.4 2 8 12.5 3.86
10 event_type Friends Rest of group 40 36.4 17.3 14.1 3 7 3.57 1.28
11 event_type Solo Rest of group 40 36.9 14.1 15.1 2 8 3.12 1.04
It's the same logic as Daniel's answer, using purrr::map and purrr::map2.
library(dplyr)
library(tibble)
library(purrr)
library(stringr)

needed_cols <- c("movie_type", "snack_type", "event_type")

new_names <- 1:2 %>%
  map(~ str_c(c("group", "mean", "sd", "n"), "_", .x)) %>%
  unlist()

my_data <- needed_cols %>%
  map(function(df_c)
    map(unique(movie_df[[df_c]]),
        function(v) {
          df <- movie_df %>%
            mutate(group = ifelse(get(df_c) == v, v, "rest_of_group")) %>%
            group_by(group) %>%
            summarize(mean = mean(total_cost), sd = sd(total_cost), n = n()) %>%
            .[match(.$group, c(v, "rest_of_group")), ]
          df <- bind_cols(df[1, ], df[2, ])
          names(df) <- new_names
          df
        })) %>%
  map2(needed_cols, ~ bind_rows(.x) %>% mutate(group = .y)) %>%
  bind_rows() %>%
  select(
    str_subset(names(.), "group") %>% sort(),
    str_subset(names(.), "mean"),
    str_subset(names(.), "sd"),
    str_subset(names(.), "n")
  ) %>%
  mutate(mean_diff = mean_1 - mean_2)
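The expected output also has a t-test column; the question does not say which formula produced those numbers, so as a hedged sketch you could append a Welch-style t statistic computed from the summary columns that are already present:
my_data %>%
  mutate(t_stat = mean_diff / sqrt(sd_1^2 / n_1 + sd_2^2 / n_2)) # Welch t from summary stats (assumption)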
Sorry it's not in pipes, but in base R we can do:
results_df <- do.call(rbind, unlist(
  apply(movie_df[, 2:4], 2, function(u)
    lapply(unique(u), function(x)
      data.frame(
        group1 = as.character(x),
        group2 = "rest",
        grp1_mean = mean(movie_df$total_cost[u == x]),
        grp2_mean = mean(movie_df$total_cost[u != x]),
        grp1_sd = sd(movie_df$total_cost[u == x]),
        grp2_sd = sd(movie_df$total_cost[u != x])
      )
    )
  ), recursive = FALSE)
)
#add mean differences
results_df$meandiff <- with(results_df, grp1_mean - grp2_mean)
> results_df
group1 group2 grp1_mean grp2_mean grp1_sd grp2_sd meandiff
movie_type1 Action rest 43.33333 35.00000 20.816660 11.54701 8.333333
movie_type2 Horror rest 35.00000 38.12500 0.000000 16.02175 -3.125000
movie_type3 Comedy rest 33.33333 39.28571 15.275252 14.55695 -5.952381
movie_type4 Thriller rest 37.50000 37.50000 17.677670 14.63850 0.000000
snack_type1 Chocolate rest 41.66667 35.71429 14.433757 14.84042 5.952381
snack_type2 Popcorn rest 33.75000 40.00000 12.500000 15.81139 -6.250000
snack_type3 Candy rest 38.33333 37.14286 20.207259 12.86375 1.190476
event_type1 Solo rest 40.00000 36.87500 14.142136 15.10381 3.125000
event_type2 Family rest 47.50000 35.00000 17.677670 13.36306 12.500000
event_type3 Date rest 26.66667 42.14286 7.637626 14.09998 -15.476190
event_type4 Friends rest 40.00000 36.42857 17.320508 14.05770 3.571429

Multiple gathering in R to create tidy dataset

I have a complicated untidy dataset; a dummy version of it can be replicated with the code below.
studentID <- seq(1:250)
score2018 <- runif(250)
score2019 <- runif(250)
score2020 <- runif(250)
payment2018 <- runif(250, min=10000, max=12000)
payment2019 <- runif(250, min=11000, max=13000)
payment2020 <- runif(250, min=12000, max=14000)
attendance2018 <- runif(250, min=0.75, max=1)
attendance2019 <- runif(250, min=0.75, max=1)
attendance2020 <- runif(250, min=0.75, max=1)
untidy_df <- data.frame(studentID, score2018, score2019, score2020, payment2018, payment2019, payment2020, attendance2018, attendance2019, attendance2020)
I would like to gather this data frame so that we only have 5 columns: studentID, year, score, payment, attendance. I know how to gather at a basic level, but I have 3 sets to gather here, and I can't see how to do this in one go.
Thanks in advance!
With tidyr you can use pivot_longer:
library(tidyr)
untidy_df %>%
  pivot_longer(cols = -studentID, names_to = c(".value", "year"), names_pattern = "(\\w+)(\\d{4})")
Output
# A tibble: 750 x 5
studentID year score payment attendance
<int> <chr> <dbl> <dbl> <dbl>
1 1 2018 0.432 10762. 0.786
2 1 2019 0.948 11340. 0.909
3 1 2020 0.122 12837. 0.944
4 2 2018 0.422 11515. 0.950
5 2 2019 0.0639 12968. 0.828
6 2 2020 0.611 13645. 0.901
7 3 2018 0.489 11281. 0.784
8 3 2019 0.00337 12250. 0.753
9 3 2020 0.711 12898. 0.803
10 4 2018 0.0596 10526. 0.842
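If you also want year as an integer rather than a character column, a hedged sketch adds names_transform (available in tidyr 1.1 or later); everything else stays the same:
untidy_df %>%
  pivot_longer(cols = -studentID,
               names_to = c(".value", "year"),
               names_pattern = "(\\w+)(\\d{4})",
               names_transform = list(year = as.integer))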
Using base R:
tidy_df <- reshape(untidy_df, direction="long", idvar="studentID", varying=2:10, sep="")
head(tidy_df)
studentID time score payment attendance
1.2018 1 2018 0.86743970 10995.45 0.9473540
2.2018 2 2018 0.53204701 11152.74 0.8167776
3.2018 3 2018 0.90072918 10631.06 0.9335316
4.2018 4 2018 0.89154492 11889.23 0.9098399
5.2018 5 2018 0.06320442 10973.20 0.8118909
6.2018 6 2018 0.67519166 11751.67 0.8328860
If you want "year" instead of the default "time", add timevar="year"
We could try:
library(dplyr)
library(tidyr)
untidy_df %>%
  pivot_longer(cols = -studentID) %>%
  separate(col = name, sep = "(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)", into = c("measure", "year")) %>%
  pivot_wider(names_from = measure, values_from = value)
Which returns:
studentID year score payment attendance
<int> <chr> <dbl> <dbl> <dbl>
1 1 2018 0.807 10179. 0.974
2 1 2019 0.599 11601. 0.785
3 1 2020 0.515 12347. 0.760
4 2 2018 0.474 11154. 0.983
5 2 2019 0.409 11682. 0.864
6 2 2020 0.688 13756. 0.812
7 3 2018 0.509 11746. 0.870
8 3 2019 0.867 12851. 0.801
9 3 2020 0.878 12710. 0.955
10 4 2018 0.621 11165. 0.975
