My data looks like this:
date schedule_id food_truck_id building_id truck_status last_confirmed_date dsle
2018-04-26 422 58 30 accepted_event 0 31
2018-04-26 422 59 30 accepted_event 2018-02-27 11
2018-04-26 422 65 30 accepted_event 2018-03-15 12
2018-04-26 422 88 30 accepted_event 2018-02-20 7
2018-04-26 422 89 30 accepted_event 2018-03-22 13
2018-04-26 422 101 30 accepted_event 2018-02-06 16
2018-04-26 422 120 30 accepted_event 2018-03-06 14
2018-04-26 422 135 30 accepted_event 2018-03-13 21
2018-04-26 399 42 33 accepted_event 2018-03-15 8
2018-04-26 399 58 33 accepted_event 0 31
2018-04-26 399 59 33 accepted_event 2018-03-01 11
2018-04-26 399 65 33 accepted_event 2018-02-27 12
2018-04-26 399 88 33 accepted_event
Can be reproduced using:
structure(list(date = structure(c(17647, 17647, 17647, 17647,
17647, 17647, 17647, 17647, 17647, 17647, 17647, 17647, 17647,
17647, 17647, 17647, 17647), class = "Date"), schedule_id = c(422L,
422L, 422L, 422L, 422L, 422L, 422L, 422L, 399L, 399L, 399L, 399L,
399L, 399L, 399L, 399L, 399L), food_truck_id = c(58L, 59L, 65L,
88L, 89L, 101L, 120L, 135L, 42L, 58L, 59L, 65L, 88L, 89L, 101L,
120L, 135L), building_id = c(30L, 30L, 30L, 30L, 30L, 30L, 30L,
30L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L), truck_status = c("accepted_event",
"accepted_event", "accepted_event", "accepted_event", "accepted_event",
"accepted_event", "accepted_event", "accepted_event", "accepted_event",
"accepted_event", "accepted_event", "accepted_event", "accepted_event",
"accepted_event", "accepted_event", "accepted_event", "accepted_event"
), last_confirmed_date = c("0", "2018-02-27", "2018-03-15", "2018-02-20",
"2018-03-22", "2018-02-06", "2018-03-06", "2018-03-13", "2018-03-15",
"0", "2018-03-01", "2018-02-27", "0", "2018-03-06", "2018-03-13",
"0", "2018-02-22"), dsle = c(31, 11, 12, 7, 13, 16, 14, 21, 8,
31, 11, 12, 7, 13, 16, 14, 21)), .Names = c("date", "schedule_id",
"food_truck_id", "building_id", "truck_status", "last_confirmed_date",
"dsle"), row.names = c(142L, 223L, 379L, 455L, 495L, 589L, 806L,
877L, 63L, 155L, 215L, 287L, 452L, 483L, 667L, 809L, 894L), class = "data.frame")
My goal is to only select the food_truck_id based on max(dsle) but it should be unique per date. For instance, for schedule_id 422, food_truck_id with max(dsle) is 58, it is also 58 for schedule_id 399.
What I want is, let's say for 422, it is 58, but for 399, it should be next max(dsle) other than 58.
I have tried the following but it doesn't gives what I want.
testxx %>%
group_by(schedule_id) %>%
distinct(food_truck_id, date, dsle) %>%
filter(dsle == max(dsle))
The result I want is following
date schedule_id food_truck_id
2018-04-26 422 58
2018-04-26 399 135
because 135 next to 58 has max(dsle)
Updated to account for date
This might be one of those occasions where a loop is the best/easiest solution.
However, it does a join operation in the loop, so there will be some optimisations that can be made
The idea is to loop over each schedule_id, and keep track of which food_trucks have already been used on which date.
If we do some pre-arranging of the data before the loop it makes things easier
df <- df %>%
arrange(schedule_id, -dsle)
## pre-allocate a result data.frame
ids <- unique(df$schedule_id)
df_res <- data.frame(schedule_id = ids,
food_truck_id = NA)
usedTrucks <- data.frame(date = as.Date(NA),
schedule_id = ids,
food_truck_id = NA_integer_)
counter <- 1
for(i in ids) {
possibleTrucks <- df[df$schedule_id %in% i, c("date", "food_truck_id")]
## possible Trucks will be in order, as we have pre-arranged the data
## use the first one that hasn't already been used
## on the given date
possibleTrucks <- anti_join(possibleTrucks, usedTrucks, by = c("date", "food_truck_id"))
thisTruck <- possibleTrucks[1, c("food_truck_id", "date")]
df_res[counter, 'food_truck_id'] <- thisTruck$food_truck_id
usedTrucks[counter, "food_truck_id"] <- thisTruck$food_truck_id
usedTrucks[counter, "date"] <- thisTruck$date
counter <- counter + 1
}
df_res
# schedule_id food_truck_id
# 1 399 58
# 2 422 135
If speed is an issue on a larger data set this can be re-written in Rcpp to make it much faster.
p<-df %>% arrange(desc(schedule_id), desc(dsle)) %>% slice(1) %>% select(date,dsle,schedule_id,food_truck_id)
df %>% subset(!(schedule_id%in%c(p))) %>% subset(!(dsle%in%c(p))) %>% select(date,dsle,schedule_id,food_truck_id) %>% arrange(desc(dsle)) %>% slice(1) %>%
rbind(p,.) %>% select(-dsle)
output
# A tibble: 2 x 3
date schedule_id food_truck_id
<date> <int> <int>
1 2018-04-26 422 58
2 2018-04-26 399 135
Related
My data:
c5 =structure(list(comorbid = c("heart", "ihd", "cabg", "angio",
"cerebrovasc", "diabetes", "pvd", "amputation", "liver", "malig",
"smoke", "ulcers"), AVF_Y = c(626L, 355L, 266L, 92L, 320L, 1175L,
199L, 89L, 75L, 450L, 901L, 114L), AVG_Y = c(54L, 14L, 18L, 5L,
21L, 37L, 5L, 7L, 5L, 29L, 33L, 3L), AVF_tot = c(2755L, 1768L,
2770L, 2831L, 2844L, 2877L, 1745L, 2823L, 2831L, 2823L, 2798L,
2829L), AVG_tot = c(161L, 61L, 161L, 165L, 166L, 167L, 61L, 165L,
165L, 165L, 159L, 164L)), row.names = c(NA, -12L), class = "data.frame")
I want to perform a prop.test for each row ( a two-proportions z-test) and add the p value as a new column.
I've tried using the following code, but this gives me 24 1-sample proportions test results instead of 12 2-sample test for equality of proportions.
Map(prop.test, x = c(c5$AVF_Y, c5$AVG_Y), n = c(c5$AVF_tot, c5$AVG_tot))
Use a lambda function and extract. When we concatenate the columns, it returns a vector and its length will be 2 times the number of rows of the data. We would need to concatenate within in the loop to create a vector of length 2 for each x and n from corresponding columns of '_Y', and '_tot'
mapply(function(avf, avg, avf_n, avg_n) prop.test(c(avf, avg), c(avf_n, avg_n))$p.value, c5$AVF_Y, c5$AVG_Y, c5$AVF_tot, c5$AVG_tot)
-output
[1] 2.218376e-03 6.985883e-01 6.026012e-01 1.000000e+00 6.695440e-01 2.425781e-06 5.672322e-01 5.861097e-01 9.627050e-01 6.546286e-01 3.360300e-03 2.276857e-0
Or use do.cal with Map or mapply
do.call(mapply, c(FUN = function(x, y, n1, n2)
prop.test(c(x, y), c(n1, n2))$p.value, unname(c5[-1])))
[1] 2.218376e-03 6.985883e-01 6.026012e-01 1.000000e+00 6.695440e-01 2.425781e-06 5.672322e-01 5.861097e-01 9.627050e-01 6.546286e-01 3.360300e-03 2.276857e-01
Or with apply
apply(c5[-1], 1, function(x) prop.test(x[1:2], x[3:4])$p.value)
[1] 2.218376e-03 6.985883e-01 6.026012e-01 1.000000e+00 6.695440e-01 2.425781e-06 5.672322e-01 5.861097e-01 9.627050e-01 6.546286e-01 3.360300e-03 2.276857e-01
Or use rowwise
library(dplyr)
c5 %>%
rowwise %>%
mutate(pval = prop.test(c(AVF_Y, AVG_Y),
n = c(AVF_tot, AVG_tot))$p.value) %>%
ungroup
-output
# A tibble: 12 × 6
comorbid AVF_Y AVG_Y AVF_tot AVG_tot pval
<chr> <int> <int> <int> <int> <dbl>
1 heart 626 54 2755 161 0.00222
2 ihd 355 14 1768 61 0.699
3 cabg 266 18 2770 161 0.603
4 angio 92 5 2831 165 1.00
5 cerebrovasc 320 21 2844 166 0.670
6 diabetes 1175 37 2877 167 0.00000243
7 pvd 199 5 1745 61 0.567
8 amputation 89 7 2823 165 0.586
9 liver 75 5 2831 165 0.963
10 malig 450 29 2823 165 0.655
11 smoke 901 33 2798 159 0.00336
12 ulcers 114 3 2829 164 0.228
I have data on subject codes and grades for students. Each has to take 4 subjects out of which English is mandatory. It is represented by code 301.
Columns: SUB1, SUB2, and so on represent subject codes for other modules and the next column represents the marks.
Based on these codes I am trying to do two things:
First thing:
I am trying to create a course column consisting of PCB if the student has subject codes 42, 43, and 44. PCM if the student has subject codes 41, 42, and 43. Commerce if the codes are 55, 54, and 30.
The issue that I am facing is that the codes are spread across columns and I facing difficulty to standardize them.
Second thing:
Based on the identified course, I am trying to sum the grades obtained by each student in these subjects. However, I am want to add the English grade to it as well.
Example of the data:
structure(list(Roll.No. = c(12653771L, 12653813L, 12653787L,
12653850L, 12653660L, 12653553L, 12653902L, 12653888L, 12653440L,
12653487L, 12653469L, 12653832L, 12653382L, 12653814L, 12653587L,
12653508L, 12653449L, 12653445L, 12653776L, 12653806L), SUB = c(301L,
301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L,
301L, 301L, 301L, 301L, 301L, 301L, 301L, 301L), MRK = c(93L,
82L, 85L, 74L, 85L, 80L, 67L, 77L, 78L, 94L, 89L, 65L, 88L, 89L,
82L, 85L, 91L, 77L, 97L, 76L), SUB.1 = c(30L, 30L, 30L, 30L,
42L, 41L, 30L, 30L, 41L, 41L, 41L, 30L, 41L, 30L, 42L, 41L, 41L,
41L, 30L, 30L), MRK.1 = c(74L, 97L, 75L, 73L, 72L, 81L, 62L,
71L, 63L, 75L, 93L, 74L, 89L, 91L, 67L, 87L, 81L, 94L, 86L, 69L
), SUB.2 = c(48L, 41L, 48L, 54L, 43L, 42L, 48L, 48L, 42L, 42L,
42L, 41L, 42L, 41L, 43L, 42L, 42L, 42L, 48L, 48L), MRK.2 = c(76L,
95L, 79L, 74L, 72L, 75L, 67L, 74L, 60L, 72L, 93L, 56L, 79L, 68L,
68L, 91L, 62L, 75L, 95L, 67L), SUB.3 = c(54L, 54L, 54L, 55L,
44L, 43L, 54L, 54L, 43L, 43L, 43L, 54L, 43L, 54L, 44L, 43L, 43L,
43L, 54L, 54L), MRK.3 = c(80L, 96L, 77L, 44L, 94L, 69L, 63L,
74L, 57L, 67L, 80L, 67L, 72L, 89L, 95L, 87L, 68L, 82L, 94L, 69L
), SUB.4 = c(55L, 55L, 55L, 64L, 265L, 48L, 55L, 55L, 48L, 48L,
283L, 55L, 48L, 55L, 64L, 283L, 283L, 48L, 55L, 55L), MRK.4 = c(45L,
95L, 46L, 76L, 91L, 74L, 44L, 52L, 82L, 92L, 92L, 60L, 81L, 49L,
83L, 89L, 90L, 83L, 93L, 61L), SUB.5 = c(NA, 64L, NA, 41L, 48L,
NA, NA, NA, NA, NA, NA, 64L, NA, 49L, NA, 48L, 49L, NA, NA, NA
), MRK.5 = c(NA, "97", NA, "AB", "87", NA, NA, NA, NA, NA, NA,
"71", NA, "97", NA, "83", "98", NA, NA, NA)), row.names = c(NA,
20L), class = "data.frame")
This should also answer: your first question:
df <- df %>%
# convert to numeric and induce NAs in MRK.5 (losing AB)
mutate(MRK.5 = as.numeric(MRK.5)) %>%
# convert NAs to 0
mutate(across(where(anyNA), ~ replace_na(., 0))) %>%
# create a new string of subjects
mutate(subjects = str_c(SUB.1, SUB.2, SUB.3, SUB.4, SUB.5, sep = "_")) %>%
# use case_when to specify course
mutate(course = case_when(str_detect(subjects, "42") & str_detect(subjects, "43") & str_detect(subjects, "44") ~ "PCB",
str_detect(subjects, "41") & str_detect(subjects, "42") & str_detect(subjects, "43") ~ "PCM",
str_detect(subjects, "30") & str_detect(subjects, "54") & str_detect(subjects, "55") ~ "Commerce",
TRUE ~ "Other"))
I am not sure whether you want the summary scores for just the three subjects plus English in each course, or for all subjects, but with a bit of wrangling you can the following dataframe, from where you should be able to get what you need.
df %>% select(-subjects) %>%
# create consistent naming for the pivot separation
rename(SUB.E = SUB, MRK.E = MRK) %>%
# pivot into tidy form
pivot_longer(cols = -c(Roll.No., subjects, course),
names_pattern = "(\\w{3})\\.(.{1})",
names_to = c(".value", "course2"))
Giving:
A tibble: 120 × 5
Roll.No. course course2 SUB MRK
<int> <chr> <chr> <dbl> <dbl>
1 12653771 Commerce E 301 93
2 12653771 Commerce 1 30 74
3 12653771 Commerce 2 48 76
4 12653771 Commerce 3 54 80
5 12653771 Commerce 4 55 45
6 12653771 Commerce 5 0 0
7 12653813 Commerce E 301 82
8 12653813 Commerce 1 30 97
9 12653813 Commerce 2 41 95
10 12653813 Commerce 3 54 96
To calculate the summed scores per student, you then need to filter by condition and use group_by and summarise:
# filter for each condition
df %>% filter(course == "Commerce" & SUB %in% c(301,30,50,54) |
course == "PCB" & SUB %in% c(301,42,43,44) |
course == "PCM" & SUB %in% c(301,41,42,43)) %>%
# summarise by group
group_by(Roll.No.,course) %>%
summarise(score = sum(MRK))
Combining the parts above into one block of code:
df %>% mutate(across(, ~as.numeric(.))) %>%
mutate(across(where(anyNA), ~ replace_na(., 0))) %>%
rename(SUB.E = SUB, MRK.E = MRK) %>%
mutate(subjects = str_c(SUB.1, SUB.2, SUB.3, SUB.4, SUB.5, sep = "_")) %>%
mutate(course = case_when(str_detect(subjects, "42") & str_detect(subjects, "43") & str_detect(subjects, "44") ~ "PCB",
str_detect(subjects, "41") & str_detect(subjects, "42") & str_detect(subjects, "43") ~ "PCM",
str_detect(subjects, "30") & str_detect(subjects, "54") & str_detect(subjects, "55") ~ "Commerce",
TRUE ~ "Other")) %>%
pivot_longer(cols = -c(Roll.No., subjects, course),
names_pattern = "(\\w{3})\\.(.{1})",
names_to = c(".value", "course2")) %>%
filter(course == "Commerce" & SUB %in% c(301,30,50,54) |
course == "PCB" & SUB %in% c(301,42,43,44) |
course == "PCM" & SUB %in% c(301,41,42,43)) %>%
group_by(Roll.No., course) %>%
summarise(combined_mark = sum(MRK))
Giving:
Roll.No. course combined_mark
<dbl> <chr> <dbl>
1 12653382 PCM 328
2 12653440 PCM 258
3 12653445 PCM 328
4 12653449 PCM 302
5 12653469 PCM 355
6 12653487 PCM 308
7 12653508 PCM 350
8 12653553 PCM 305
9 12653587 PCB 312
10 12653660 PCB 323
11 12653771 Commerce 247
12 12653776 Commerce 277
13 12653787 Commerce 237
14 12653806 Commerce 214
15 12653813 Commerce 275
16 12653814 Commerce 269
17 12653832 Commerce 206
18 12653850 Commerce 221
19 12653888 Commerce 222
20 12653902 Commerce 192
Answering your first question assuming that you are interested in students participating in ALL of these courses:
I am trying to create a course column consisting of PCB if the student has subject codes 42, 43, and 44. PCM if the student has subject codes 41, 42, and 43. Commerce if the codes are 55, 54, and 30.
library(tidyverse)
df %>%
mutate(across(-Roll.No., ~ as.numeric(.))) %>%
rowwise() %>%
mutate(subject_summary = case_when(sum(c_across(starts_with("SUB")) %in% c(42, 43, 44)) == 3 ~"PCB",
sum(c_across(starts_with("SUB")) %in% c(41, 42, 43)) == 3 ~ "PCM",
sum(c_across(starts_with("SUB")) %in% c(55, 54, 30)) == 3 ~ "Commerce",
TRUE ~ NA_character_)) %>%
ungroup()
gives:
# A tibble: 20 x 14
Roll.No. SUB MRK SUB.1 MRK.1 SUB.2 MRK.2 SUB.3 MRK.3 SUB.4 MRK.4 SUB.5 MRK.5 subject_summary
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 12653771 301 93 30 74 48 76 54 80 55 45 NA NA Commerce
2 12653813 301 82 30 97 41 95 54 96 55 95 64 97 Commerce
3 12653787 301 85 30 75 48 79 54 77 55 46 NA NA Commerce
4 12653850 301 74 30 73 54 74 55 44 64 76 41 NA Commerce
5 12653660 301 85 42 72 43 72 44 94 265 91 48 87 PCB
6 12653553 301 80 41 81 42 75 43 69 48 74 NA NA PCM
7 12653902 301 67 30 62 48 67 54 63 55 44 NA NA Commerce
8 12653888 301 77 30 71 48 74 54 74 55 52 NA NA Commerce
9 12653440 301 78 41 63 42 60 43 57 48 82 NA NA PCM
10 12653487 301 94 41 75 42 72 43 67 48 92 NA NA PCM
11 12653469 301 89 41 93 42 93 43 80 283 92 NA NA PCM
12 12653832 301 65 30 74 41 56 54 67 55 60 64 71 Commerce
13 12653382 301 88 41 89 42 79 43 72 48 81 NA NA PCM
14 12653814 301 89 30 91 41 68 54 89 55 49 49 97 Commerce
15 12653587 301 82 42 67 43 68 44 95 64 83 NA NA PCB
16 12653508 301 85 41 87 42 91 43 87 283 89 48 83 PCM
17 12653449 301 91 41 81 42 62 43 68 283 90 49 98 PCM
18 12653445 301 77 41 94 42 75 43 82 48 83 NA NA PCM
19 12653776 301 97 30 86 48 95 54 94 55 93 NA NA Commerce
20 12653806 301 76 30 69 48 67 54 69 55 61 NA NA Commerce
You second question can be answered by continuing above code (but showing the full code here for convenience):
df %>%
mutate(across(-Roll.No., ~ as.numeric(.))) %>%
rowwise() %>%
mutate(subject_summary = case_when(sum(c_across(starts_with("SUB")) %in% c(42, 43, 44)) == 3 ~"PCB",
sum(c_across(starts_with("SUB")) %in% c(41, 42, 43)) == 3 ~ "PCM",
sum(c_across(starts_with("SUB")) %in% c(55, 54, 30)) == 3 ~ "Commerce",
TRUE ~ NA_character_)) %>%
ungroup() %>%
pivot_longer(cols = -c(Roll.No., subject_summary, SUB, MRK),
names_pattern = "(.*)(\\d{1})",
names_to = c(".value", "course")) %>%
group_by(Roll.No.) %>%
summarize(subject_summary = first(subject_summary),
grade = case_when(all(subject_summary == "PCB") ~ sum(MRK.[SUB. %in% c(42, 43, 44)]) + first(MRK),
all(subject_summary == "PCM") ~ sum(MRK.[SUB. %in% c(41, 42, 43)]) + first(MRK),
all(subject_summary == "Commerce") ~ sum(MRK.[SUB. %in% c(55, 54, 30)]) + first(MRK)))
gives:
# A tibble: 20 x 3
Roll.No. subject_summary grade
<int> <chr> <dbl>
1 12653382 PCM 328
2 12653440 PCM 258
3 12653445 PCM 328
4 12653449 PCM 302
5 12653469 PCM 355
6 12653487 PCM 308
7 12653508 PCM 350
8 12653553 PCM 305
9 12653587 PCB 312
10 12653660 PCB 323
11 12653771 Commerce 292
12 12653776 Commerce 370
13 12653787 Commerce 283
14 12653806 Commerce 275
15 12653813 Commerce 370
16 12653814 Commerce 318
17 12653832 Commerce 266
18 12653850 Commerce 265
19 12653888 Commerce 274
20 12653902 Commerce 236
I have a dataframe structure that calculates the sum of Response.Status found per month with this mutate function:
DF1 <- complete_df %>%
mutate(Month = format(as.Date(date, format = "%Y/%m/%d"), "%m/%Y"),
UNSUBSCRIBE = if_else(UNSUBSCRIBE == "TRUE", "UNSUBSCRIBE", NA_character_)) %>%
pivot_longer(c(Response.Status, UNSUBSCRIBE), values_to = "Response.Status") %>%
drop_na() %>%
count(Month, Response.Status) %>%
pivot_wider(names_from = Month, names_sep = "/", values_from = n)
# A tibble: 7 x 16
Response.Status `01/2020` `02/2020` `03/2020` `04/2020` `05/2020` `06/2020` `07/2020` `08/2020` `09/2019` `09/2020` `10/2019` `10/2020` `11/2019` `11/2020` `12/2019`
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 EMAIL_OPENED 1068 3105 4063 4976 2079 1856 4249 3638 882 4140 865 2573 1167 684 862
2 NOT_RESPONDED 3187 9715 13164 15239 5458 4773 12679 10709 2798 15066 2814 8068 3641 1931 2647
3 PARTIALLY_SAVED 5 34 56 8 28 22 73 86 11 14 7 23 8 8 2
4 SUBMITTED 216 557 838 828 357 310 654 621 214 1001 233 497 264 122 194
5 SURVEY_OPENED 164 395 597 1016 245 212 513 625 110 588 123 349 202 94 120
6 UNDELIVERED_OR_BOUNCED 92 280 318 260 109 127 319 321 63 445 69 192 93 39 74
7 UNSUBSCRIBE 397 1011 1472 1568 727 737 1745 2189 372 1451 378 941 429 254 355
What I would like to do is take those values created in table to calculate average based on # of people in each Response.Status group.
structure(list(Response.Status = c("EMAIL_OPENED", "NOT_RESPONDED",
"PARTIALLY_SAVED", "SUBMITTED", "SURVEY_OPENED", "UNDELIVERED_OR_BOUNCED"
), `01/2020` = c(1068L, 3187L, 5L, 216L, 164L, 92L), `02/2020` = c(3105L,
9715L, 34L, 557L, 395L, 280L), `03/2020` = c(4063L, 13164L, 56L,
838L, 597L, 318L), `04/2020` = c(4976L, 15239L, 8L, 828L, 1016L,
260L), `05/2020` = c(2079L, 5458L, 28L, 357L, 245L, 109L), `06/2020` = c(1856L,
4773L, 22L, 310L, 212L, 127L), `07/2020` = c(4249L, 12679L, 73L,
654L, 513L, 319L), `08/2020` = c(3638L, 10709L, 86L, 621L, 625L,
321L), `09/2019` = c(882L, 2798L, 11L, 214L, 110L, 63L), `09/2020` = c(4140L,
15066L, 14L, 1001L, 588L, 445L), `10/2019` = c(865L, 2814L, 7L,
233L, 123L, 69L), `10/2020` = c(2573L, 8068L, 23L, 497L, 349L,
192L), `11/2019` = c(1167L, 3641L, 8L, 264L, 202L, 93L), `11/2020` = c(684L,
1931L, 8L, 122L, 94L, 39L), `12/2019` = c(862L, 2647L, 2L, 194L,
120L, 74L)), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
I made a separate table that contains sum values based on those group names:
Response.Status
EMAIL_OPENED : 451
NOT_RESPONDED : 1563
PARTIALLY_SAVED : 4
SUBMITTED : 71
SURVEY_OPENED : 53
UNDELIVERED_OR_BOUNCED: 47
UNSUBSCRIBE: 135
If I understood your problem correctly you have 2 data.frame/tibbles. One that is shown in the "structure" part an one that informs the quantity of people/users per response status. Now you want to get the value per person. If so this is a possible solution:
# people/users data set
df2 <- data.frame(Response.Status = c("EMAIL_OPENED", "NOT_RESPONDED", "PARTIALLY_SAVED", "SUBMITTED", "SURVEY_OPENED", "UNDELIVERED_OR_BOUNCED", "UNSUBSCRIBE"),
PEOPLE = c(451, 1563, 4, 71, 53, 47, 135))
df %>% # this is your "structure"
tidyr::pivot_longer(-Response.Status, names_to = "DATE", values_to = "nmbr") %>%
dplyr::group_by(Response.Status) %>%
dplyr::summarise(SUM = sum(nmbr)) %>%
dplyr::inner_join(df2) %>%
dplyr::mutate(MEAN_PP = SUM / PEOPLE)
Response.Status SUM PEOPLE MEAN_PP
<chr> <int> <dbl> <dbl>
1 EMAIL_OPENED 36207 451 80.3
2 NOT_RESPONDED 111889 1563 71.6
3 PARTIALLY_SAVED 385 4 96.2
4 SUBMITTED 6906 71 97.3
5 SURVEY_OPENED 5353 53 101
6 UNDELIVERED_OR_BOUNCED 2801 47 59.6
I have excel dataset as follows:
Weight Quantity Price
72 5 460
73 8 720
75 20 830
95 2 490
91 15 680
82 14 340
88 30 250
89 6 770
78 27 820
98 24 940
99 29 825
I want to get a weight vs Quantity pivot table with sum of prices for each category as follows:
0-10 10-20 20-30
70-80 1180 830 820
80-90 770 340 250
90-100 490 680 1765
I had created two tables for the individual categories to get the average and count using dplyr package as follows:
table1 <- group_by(dataset, Weight = cut(Weight, breaks = c(70,80,90,100))
result1 <- summarise(table1, Count = n(), Avg_Price = mean(Price, na.rm = T))
table2 <- group_by(dataset, Quantity = cut(Quantity, breaks = c(0,10,20,30))
result2 <- summarise(table2, Count = n(), Avg_Price = mean(Price, na.rm = T))
Now, How do i use table1 and table2 to create a crosstab table as above?
Maybe the following is what you want. It uses cut like you have, then xtabs.
Weight = cut(dataset$Weight, breaks = c(70,80,90,100))
Quantity = cut(dataset$Quantity, breaks = c(0,10,20,30))
dt2 <- data.frame(Weight, Quantity, Price = dataset$Price)
xtabs(Price ~ Weight + Quantity, dt2)
# Quantity
#Weight (0,10] (10,20] (20,30]
# (70,80] 1180 830 820
# (80,90] 770 340 250
# (90,100] 490 680 1765
A dplyr and tidyr solution:
library(dplyr)
library(tidyr)
df %>%
mutate(Weight = cut(Weight, breaks = c(70,80,90,100)),
Quantity = cut(Quantity, breaks = c(0,10,20,30))) %>%
group_by(Weight, Quantity) %>%
summarise(Price = sum(Price)) %>%
spread(Quantity, Price)
# A tibble: 3 x 4
# Groups: Weight [3]
Weight `(0,10]` `(10,20]` `(20,30]`
* <fct> <int> <int> <int>
1 (70,80] 1180 830 820
2 (80,90] 770 340 250
3 (90,100] 490 680 1765
Data:
df <- structure(list(Weight = c(72L, 73L, 75L, 95L, 91L, 82L, 88L,
89L, 78L, 98L, 99L), Quantity = c(5L, 8L, 20L, 2L, 15L, 14L,
30L, 6L, 27L, 24L, 29L), Price = c(460L, 720L, 830L, 490L, 680L,
340L, 250L, 770L, 820L, 940L, 825L)), .Names = c("Weight", "Quantity",
"Price"), class = "data.frame", row.names = c(NA, -11L))
This question already has answers here:
How to sum a variable by group
(18 answers)
Closed 5 years ago.
Here is a minimal example of dataframe to reproduce.
df <- structure(list(Gene = structure(c(147L, 147L, 148L, 148L, 148L,
87L, 87L, 87L, 87L, 87L), .Label = c("genome", "k141_1189_101",
"k141_1189_104", "k141_1189_105", "k141_1189_116", "k141_1189_13",
"k141_1189_14", "k141_1189_146", "k141_1189_150", "k141_1189_18",
"k141_1189_190", "k141_1189_194", "k141_1189_215", "k141_1189_248",
"k141_1189_251", "k141_1189_252", "k141_1189_259", "k141_1189_274",
"k141_1189_283", "k141_1189_308", "k141_1189_314", "k141_1189_322",
"k141_1189_353", "k141_1189_356", "k141_1189_372", "k141_1189_373",
"k141_1189_43", "k141_1189_45", "k141_1189_72", "k141_1597_15",
"k141_1597_18", "k141_1597_23", "k141_1597_41", "k141_1597_55",
"k141_1597_66", "k141_1597_67", "k141_1597_68", "k141_1597_69",
"k141_2409_34", "k141_2409_8", "k141_3390_69", "k141_3390_83",
"k141_3390_84", "k141_3726_25", "k141_3726_31", "k141_3726_49",
"k141_3726_50", "k141_3726_62", "k141_3726_8", "k141_3726_80",
"k141_3790_1", "k141_3993_114", "k141_3993_122", "k141_3993_162",
"k141_3993_172", "k141_3993_183", "k141_3993_186", "k141_3993_188",
"k141_3993_24", "k141_3993_25", "k141_3993_28", "k141_3993_32",
"k141_3993_44", "k141_3993_47", "k141_3993_53", "k141_3993_57",
"k141_3993_68", "k141_4255_80", "k141_4255_81", "k141_4255_87",
"k141_5079_107", "k141_5079_110", "k141_5079_130", "k141_5079_14",
"k141_5079_141", "k141_5079_16", "k141_5079_184", "k141_5079_185",
"k141_5079_202", "k141_5079_24", "k141_5079_39", "k141_5079_63",
"k141_5079_65", "k141_5079_70", "k141_5079_77", "k141_5079_87",
"k141_5079_9", "k141_5313_16", "k141_5313_17", "k141_5313_20",
"k141_5313_23", "k141_5313_39", "k141_5313_5", "k141_5313_51",
"k141_5313_52", "k141_5313_78", "k141_5545_101", "k141_5545_103",
"k141_5545_104", "k141_5545_105", "k141_5545_106", "k141_5545_107",
"k141_5545_108", "k141_5545_109", "k141_5545_110", "k141_5545_111",
"k141_5545_112", "k141_5545_113", "k141_5545_114", "k141_5545_119",
"k141_5545_128", "k141_5545_130", "k141_5545_139", "k141_5545_141",
"k141_5545_145", "k141_5545_16", "k141_5545_169", "k141_5545_17",
"k141_5545_172", "k141_5545_6", "k141_5545_60", "k141_5545_62",
"k141_5545_63", "k141_5545_86", "k141_5545_87", "k141_5545_88",
"k141_5545_89", "k141_5545_91", "k141_5545_92", "k141_5545_93",
"k141_5545_94", "k141_5545_96", "k141_5545_97", "k141_5545_98",
"k141_5545_99", "k141_5734_13", "k141_5734_2", "k141_5734_4",
"k141_5734_5", "k141_5734_6", "k141_6014_124", "k141_6014_2",
"k141_6014_34", "k141_6014_75", "k141_6014_96", "k141_908_14",
"k141_908_2", "k141_908_5", "k141_957_126", "k141_957_135", "k141_957_136",
"k141_957_14", "k141_957_140", "k141_957_141", "k141_957_148",
"k141_957_179", "k141_957_191", "k141_957_35", "k141_957_47",
"k141_957_55", "k141_957_57", "k141_957_59", "k141_957_6", "k141_957_63",
"k141_957_65", "k141_957_68", "k141_957_77", "k141_957_95"), class = "factor"),
depth = c(9L, 10L, 9L, 10L, 11L, 14L, 15L, 16L, 17L, 18L),
bases_covered = c(6L, 3L, 4L, 7L, 4L, 59L, 54L, 70L, 34L,
17L), gene_length = c(1140L, 1140L, 591L, 591L, 591L, 690L,
690L, 690L, 690L, 690L), regioncoverage = c(54L, 30L, 36L,
70L, 44L, 826L, 810L, 1120L, 578L, 306L)), .Names = c("Gene",
"depth", "bases_covered", "gene_length", "regioncoverage"), row.names = c(1L,
2L, 33L, 34L, 35L, 78L, 79L, 80L, 81L, 82L), class = "data.frame")
The dataframe looks like this:
Gene depth bases_covered gene_length regioncoverage
1 k141_908_2 9 6 1140 54
2 k141_908_2 10 3 1140 30
33 k141_908_5 9 4 591 36
34 k141_908_5 10 7 591 70
35 k141_908_5 11 4 591 44
78 k141_5079_9 14 59 690 826
79 k141_5079_9 15 54 690 810
80 k141_5079_9 16 70 690 1120
81 k141_5079_9 17 34 690 578
82 k141_5079_9 18 17 690 306
What i want is that for each Gene (e.g k141_908_2) i want to sum region coverage and divide by unique(gene length). In fact gene length is always the same value for each gene.
For example for Gene K141_908_2 i would do: (54+30)/1140 = 0.07
For example for Gene K141_908_5 i would do: (36+70+44)/591 = 0.25
The final dataframe should report two columns.
Gene Newcoverage
1 k141_908_2 0.07
2 k141_908_5 0.25
3 ......
and so on .
Thanks for your help
This is straightforward with dplyr:
library(dplyr)
df_final <- df %>%
group_by(Gene) %>%
summarize(Newcoverage = sum(regioncoverage) / first(gene_length))
df_final
# # A tibble: 3 × 2
# Gene Newcoverage
# <fctr> <dbl>
# 1 k141_5079_9 5.27536232
# 2 k141_908_2 0.07368421
# 3 k141_908_5 0.25380711
I needed to set the first column to character and others to numeric. But after that you can just split the df by gene and then do the necessary calculations.
df[,2:5] = lapply(df[,2:5], as.numeric)
df$Gene = as.character(df$Gene)
sapply(split(df, df$Gene), function(x) sum(x[,5]/x[1,4]))
#k141_5079_9 k141_908_2 k141_908_5
# 5.27536232 0.07368421 0.25380711
We can use tidyverse
library(tidyverse)
df %>%
group_by(Gene) %>%
summarise(Newcoverage = sum(regioncoverage)/gene_length[1])
# A tibble: 3 × 2
# Gene Newcoverage
# <fctr> <dbl>
#1 k141_5079_9 5.27536232
#2 k141_908_2 0.07368421
#3 k141_908_5 0.25380711
Or a base R option is
by(df[4:5], list(as.character(df[,'Gene'])), FUN= function(x) sum(x[,2])/x[1,1])
quick approach is
require(data.table)
DT <- setDT(df)
#just to output unique rows
DT[, .(New_Coverage = unique(sum(regioncoverage)/gene_length)), by = .(Gene)]
output
Gene New_Coverage
1: k141_908_2 0.07368421
2: k141_908_5 0.25380711
3: k141_5079_9 5.27536232
I use dplyr a lot. So here's one way:
library(dplyr)
df %>%
group_by(Gene) %>%
mutate(Newcoverage=sum(regioncoverage)/unique(gene_length))
If you want only unique values per Gene:
df %>%
group_by(Gene) %>%
transmute(Newcoverage=sum(regioncoverage)/unique(gene_length)) %>%
unique()