How to create a crosstab table using two tables in R? - r

I have excel dataset as follows:
Weight Quantity Price
72 5 460
73 8 720
75 20 830
95 2 490
91 15 680
82 14 340
88 30 250
89 6 770
78 27 820
98 24 940
99 29 825
I want to get a weight vs Quantity pivot table with sum of prices for each category as follows:
0-10 10-20 20-30
70-80 1180 830 820
80-90 770 340 250
90-100 490 680 1765
I had created two tables for the individual categories to get the average and count using dplyr package as follows:
# Bin Weight with cut(), then count rows and average Price within each bin.
table1 <- group_by(dataset, Weight = cut(Weight, breaks = c(70, 80, 90, 100)))
result1 <- summarise(table1, Count = n(), Avg_Price = mean(Price, na.rm = TRUE))
# Same summary for Quantity, binned at 0, 10, 20, 30.
table2 <- group_by(dataset, Quantity = cut(Quantity, breaks = c(0, 10, 20, 30)))
result2 <- summarise(table2, Count = n(), Avg_Price = mean(Price, na.rm = TRUE))
Now, How do i use table1 and table2 to create a crosstab table as above?

Maybe the following is what you want. It uses cut like you have, then xtabs.
# Bin both measures, keeping the binned factors as standalone vectors.
Weight <- with(dataset, cut(Weight, breaks = c(70, 80, 90, 100)))
Quantity <- with(dataset, cut(Quantity, breaks = c(0, 10, 20, 30)))
# Assemble the binned factors together with the raw prices.
dt2 <- data.frame(
  Weight,
  Quantity,
  Price = dataset$Price
)
# Sum Price within every Weight-bin x Quantity-bin cell.
xtabs(Price ~ Weight + Quantity, data = dt2)
# Quantity
#Weight (0,10] (10,20] (20,30]
# (70,80] 1180 830 820
# (80,90] 770 340 250
# (90,100] 490 680 1765

A dplyr and tidyr solution:
library(dplyr)
library(tidyr)
# Bin Weight and Quantity, total Price per (Weight bin, Quantity bin) pair,
# then spread the Quantity bins across columns to form the crosstab.
df %>%
  mutate(
    Weight = cut(Weight, breaks = c(70, 80, 90, 100)),
    Quantity = cut(Quantity, breaks = c(0, 10, 20, 30))
  ) %>%
  group_by(Weight, Quantity) %>%
  summarise(Price = sum(Price)) %>%
  # pivot_wider() is the maintained replacement for the superseded spread()
  pivot_wider(names_from = Quantity, values_from = Price)
# A tibble: 3 x 4
# Groups: Weight [3]
Weight `(0,10]` `(10,20]` `(20,30]`
* <fct> <int> <int> <int>
1 (70,80] 1180 830 820
2 (80,90] 770 340 250
3 (90,100] 490 680 1765
Data:
# Example data: 11 observations of integer Weight, Quantity and Price.
df <- data.frame(
  Weight = c(72L, 73L, 75L, 95L, 91L, 82L, 88L, 89L, 78L, 98L, 99L),
  Quantity = c(5L, 8L, 20L, 2L, 15L, 14L, 30L, 6L, 27L, 24L, 29L),
  Price = c(460L, 720L, 830L, 490L, 680L, 340L, 250L, 770L, 820L, 940L, 825L)
)

Related

Rowwise proportion test and add p value as new column

My data:
# Example data: one row per comorbidity, with event counts (*_Y) and
# group totals (*_tot) for the AVF and AVG groups.
c5 <- data.frame(
  comorbid = c(
    "heart", "ihd", "cabg", "angio", "cerebrovasc", "diabetes",
    "pvd", "amputation", "liver", "malig", "smoke", "ulcers"
  ),
  AVF_Y = c(626L, 355L, 266L, 92L, 320L, 1175L, 199L, 89L, 75L, 450L, 901L, 114L),
  AVG_Y = c(54L, 14L, 18L, 5L, 21L, 37L, 5L, 7L, 5L, 29L, 33L, 3L),
  AVF_tot = c(
    2755L, 1768L, 2770L, 2831L, 2844L, 2877L,
    1745L, 2823L, 2831L, 2823L, 2798L, 2829L
  ),
  AVG_tot = c(161L, 61L, 161L, 165L, 166L, 167L, 61L, 165L, 165L, 165L, 159L, 164L),
  stringsAsFactors = FALSE
)
I want to perform a prop.test for each row ( a two-proportions z-test) and add the p value as a new column.
I've tried using the following code, but this gives me 24 1-sample proportions test results instead of 12 2-sample test for equality of proportions.
# Concatenating the columns yields one long x and one long n, so prop.test is
# called once per element, i.e. one 1-sample test per count.
with(c5, Map(prop.test, x = c(AVF_Y, AVG_Y), n = c(AVF_tot, AVG_tot)))
Use a lambda function and extract the p-value. When we concatenate the columns, the result is a single vector whose length is twice the number of rows of the data. Instead, we need to concatenate within the loop, creating a vector of length 2 for each x and n from the corresponding '_Y' and '_tot' columns.
# Row-wise two-sample proportion test: pair the two event counts and the two
# totals of each row, and keep only the p-value.
with(c5, mapply(
  function(x_avf, x_avg, n_avf, n_avg) {
    res <- prop.test(c(x_avf, x_avg), c(n_avf, n_avg))
    res$p.value
  },
  AVF_Y, AVG_Y, AVF_tot, AVG_tot
))
-output
[1] 2.218376e-03 6.985883e-01 6.026012e-01 1.000000e+00 6.695440e-01 2.425781e-06 5.672322e-01 5.861097e-01 9.627050e-01 6.546286e-01 3.360300e-03 2.276857e-01
Or use do.call with Map or mapply
# Build the mapply() argument list explicitly: the worker function as FUN,
# followed by the four numeric columns passed positionally (names removed).
do.call(
  mapply,
  c(
    list(FUN = function(x1, x2, n1, n2) prop.test(c(x1, x2), c(n1, n2))$p.value),
    unname(as.list(c5[-1]))
  )
)
[1] 2.218376e-03 6.985883e-01 6.026012e-01 1.000000e+00 6.695440e-01 2.425781e-06 5.672322e-01 5.861097e-01 9.627050e-01 6.546286e-01 3.360300e-03 2.276857e-01
Or with apply
# Iterate over rows of the numeric columns; within a row, elements 1-2 are the
# event counts and elements 3-4 the matching totals.
apply(c5[-1], MARGIN = 1, FUN = function(counts) {
  prop.test(counts[1:2], counts[3:4])$p.value
})
[1] 2.218376e-03 6.985883e-01 6.026012e-01 1.000000e+00 6.695440e-01 2.425781e-06 5.672322e-01 5.861097e-01 9.627050e-01 6.546286e-01 3.360300e-03 2.276857e-01
Or use rowwise
library(dplyr)
# Treat every row as its own group so prop.test sees one pair of counts and
# one pair of totals at a time; drop the row-wise grouping afterwards.
ungroup(
  mutate(
    rowwise(c5),
    pval = prop.test(c(AVF_Y, AVG_Y), n = c(AVF_tot, AVG_tot))$p.value
  )
)
-output
# A tibble: 12 × 6
comorbid AVF_Y AVG_Y AVF_tot AVG_tot pval
<chr> <int> <int> <int> <int> <dbl>
1 heart 626 54 2755 161 0.00222
2 ihd 355 14 1768 61 0.699
3 cabg 266 18 2770 161 0.603
4 angio 92 5 2831 165 1.00
5 cerebrovasc 320 21 2844 166 0.670
6 diabetes 1175 37 2877 167 0.00000243
7 pvd 199 5 1745 61 0.567
8 amputation 89 7 2823 165 0.586
9 liver 75 5 2831 165 0.963
10 malig 450 29 2823 165 0.655
11 smoke 901 33 2798 159 0.00336
12 ulcers 114 3 2829 164 0.228

Calculate average based on columns in 2 dataframes and their values via mutate in R?

I have a dataframe structure that calculates the sum of Response.Status found per month with this mutate function:
DF1 <- local({
  # Label each row with its "mm/YYYY" month and turn the UNSUBSCRIBE flag into
  # a status label (NA when the flag is not "TRUE").
  labelled <- mutate(
    complete_df,
    Month = format(as.Date(date, format = "%Y/%m/%d"), "%m/%Y"),
    UNSUBSCRIBE = if_else(UNSUBSCRIBE == "TRUE", "UNSUBSCRIBE", NA_character_)
  )
  # Stack the two status columns into a single Response.Status column.
  stacked <- pivot_longer(
    labelled,
    c(Response.Status, UNSUBSCRIBE),
    values_to = "Response.Status"
  )
  # Discard rows containing NA, then count occurrences per month and status.
  tallied <- count(drop_na(stacked), Month, Response.Status)
  # One column per month, one row per status.
  pivot_wider(tallied, names_from = Month, names_sep = "/", values_from = n)
})
# A tibble: 7 x 16
Response.Status `01/2020` `02/2020` `03/2020` `04/2020` `05/2020` `06/2020` `07/2020` `08/2020` `09/2019` `09/2020` `10/2019` `10/2020` `11/2019` `11/2020` `12/2019`
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 EMAIL_OPENED 1068 3105 4063 4976 2079 1856 4249 3638 882 4140 865 2573 1167 684 862
2 NOT_RESPONDED 3187 9715 13164 15239 5458 4773 12679 10709 2798 15066 2814 8068 3641 1931 2647
3 PARTIALLY_SAVED 5 34 56 8 28 22 73 86 11 14 7 23 8 8 2
4 SUBMITTED 216 557 838 828 357 310 654 621 214 1001 233 497 264 122 194
5 SURVEY_OPENED 164 395 597 1016 245 212 513 625 110 588 123 349 202 94 120
6 UNDELIVERED_OR_BOUNCED 92 280 318 260 109 127 319 321 63 445 69 192 93 39 74
7 UNSUBSCRIBE 397 1011 1472 1568 727 737 1745 2189 372 1451 378 941 429 254 355
What I would like to do is take those values created in table to calculate average based on # of people in each Response.Status group.
structure(list(Response.Status = c("EMAIL_OPENED", "NOT_RESPONDED",
"PARTIALLY_SAVED", "SUBMITTED", "SURVEY_OPENED", "UNDELIVERED_OR_BOUNCED"
), `01/2020` = c(1068L, 3187L, 5L, 216L, 164L, 92L), `02/2020` = c(3105L,
9715L, 34L, 557L, 395L, 280L), `03/2020` = c(4063L, 13164L, 56L,
838L, 597L, 318L), `04/2020` = c(4976L, 15239L, 8L, 828L, 1016L,
260L), `05/2020` = c(2079L, 5458L, 28L, 357L, 245L, 109L), `06/2020` = c(1856L,
4773L, 22L, 310L, 212L, 127L), `07/2020` = c(4249L, 12679L, 73L,
654L, 513L, 319L), `08/2020` = c(3638L, 10709L, 86L, 621L, 625L,
321L), `09/2019` = c(882L, 2798L, 11L, 214L, 110L, 63L), `09/2020` = c(4140L,
15066L, 14L, 1001L, 588L, 445L), `10/2019` = c(865L, 2814L, 7L,
233L, 123L, 69L), `10/2020` = c(2573L, 8068L, 23L, 497L, 349L,
192L), `11/2019` = c(1167L, 3641L, 8L, 264L, 202L, 93L), `11/2020` = c(684L,
1931L, 8L, 122L, 94L, 39L), `12/2019` = c(862L, 2647L, 2L, 194L,
120L, 74L)), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
I made a separate table that contains sum values based on those group names:
Response.Status
EMAIL_OPENED : 451
NOT_RESPONDED : 1563
PARTIALLY_SAVED : 4
SUBMITTED : 71
SURVEY_OPENED : 53
UNDELIVERED_OR_BOUNCED: 47
UNSUBSCRIBE: 135
If I understood your problem correctly, you have 2 data.frames/tibbles: one that is shown in the "structure" part, and one that gives the number of people/users per response status. Now you want to get the value per person. If so, this is a possible solution:
# people/users data set
# Number of people/users in each response-status group.
df2 <- data.frame(
  Response.Status = c(
    "EMAIL_OPENED", "NOT_RESPONDED", "PARTIALLY_SAVED", "SUBMITTED",
    "SURVEY_OPENED", "UNDELIVERED_OR_BOUNCED", "UNSUBSCRIBE"
  ),
  PEOPLE = c(451, 1563, 4, 71, 53, 47, 135)
)
df %>% # this is your "structure"
  # Long format: one row per (status, month) count.
  tidyr::pivot_longer(-Response.Status, names_to = "DATE", values_to = "nmbr") %>%
  dplyr::group_by(Response.Status) %>%
  dplyr::summarise(SUM = sum(nmbr)) %>%
  # Join key stated explicitly instead of relying on the implicit natural join.
  dplyr::inner_join(df2, by = "Response.Status") %>%
  dplyr::mutate(MEAN_PP = SUM / PEOPLE)
Response.Status SUM PEOPLE MEAN_PP
<chr> <int> <dbl> <dbl>
1 EMAIL_OPENED 36207 451 80.3
2 NOT_RESPONDED 111889 1563 71.6
3 PARTIALLY_SAVED 385 4 96.2
4 SUBMITTED 6906 71 97.3
5 SURVEY_OPENED 5353 53 101
6 UNDELIVERED_OR_BOUNCED 2801 47 59.6

Finding Unique per group with filter

My data looks like this:
date schedule_id food_truck_id building_id truck_status last_confirmed_date dsle
2018-04-26 422 58 30 accepted_event 0 31
2018-04-26 422 59 30 accepted_event 2018-02-27 11
2018-04-26 422 65 30 accepted_event 2018-03-15 12
2018-04-26 422 88 30 accepted_event 2018-02-20 7
2018-04-26 422 89 30 accepted_event 2018-03-22 13
2018-04-26 422 101 30 accepted_event 2018-02-06 16
2018-04-26 422 120 30 accepted_event 2018-03-06 14
2018-04-26 422 135 30 accepted_event 2018-03-13 21
2018-04-26 399 42 33 accepted_event 2018-03-15 8
2018-04-26 399 58 33 accepted_event 0 31
2018-04-26 399 59 33 accepted_event 2018-03-01 11
2018-04-26 399 65 33 accepted_event 2018-02-27 12
2018-04-26 399 88 33 accepted_event
Can be reproduced using:
structure(list(date = structure(c(17647, 17647, 17647, 17647,
17647, 17647, 17647, 17647, 17647, 17647, 17647, 17647, 17647,
17647, 17647, 17647, 17647), class = "Date"), schedule_id = c(422L,
422L, 422L, 422L, 422L, 422L, 422L, 422L, 399L, 399L, 399L, 399L,
399L, 399L, 399L, 399L, 399L), food_truck_id = c(58L, 59L, 65L,
88L, 89L, 101L, 120L, 135L, 42L, 58L, 59L, 65L, 88L, 89L, 101L,
120L, 135L), building_id = c(30L, 30L, 30L, 30L, 30L, 30L, 30L,
30L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L), truck_status = c("accepted_event",
"accepted_event", "accepted_event", "accepted_event", "accepted_event",
"accepted_event", "accepted_event", "accepted_event", "accepted_event",
"accepted_event", "accepted_event", "accepted_event", "accepted_event",
"accepted_event", "accepted_event", "accepted_event", "accepted_event"
), last_confirmed_date = c("0", "2018-02-27", "2018-03-15", "2018-02-20",
"2018-03-22", "2018-02-06", "2018-03-06", "2018-03-13", "2018-03-15",
"0", "2018-03-01", "2018-02-27", "0", "2018-03-06", "2018-03-13",
"0", "2018-02-22"), dsle = c(31, 11, 12, 7, 13, 16, 14, 21, 8,
31, 11, 12, 7, 13, 16, 14, 21)), .Names = c("date", "schedule_id",
"food_truck_id", "building_id", "truck_status", "last_confirmed_date",
"dsle"), row.names = c(142L, 223L, 379L, 455L, 495L, 589L, 806L,
877L, 63L, 155L, 215L, 287L, 452L, 483L, 667L, 809L, 894L), class = "data.frame")
My goal is to only select the food_truck_id based on max(dsle) but it should be unique per date. For instance, for schedule_id 422, food_truck_id with max(dsle) is 58, it is also 58 for schedule_id 399.
What I want is, let's say for 422, it is 58, but for 399, it should be next max(dsle) other than 58.
I have tried the following, but it doesn't give what I want.
# Within each schedule, keep the distinct (truck, date, dsle) combinations
# whose dsle equals that schedule's maximum.
filter(
  distinct(
    group_by(testxx, schedule_id),
    food_truck_id, date, dsle
  ),
  dsle == max(dsle)
)
The result I want is following
date schedule_id food_truck_id
2018-04-26 422 58
2018-04-26 399 135
because 135 next to 58 has max(dsle)
Updated to account for date
This might be one of those occasions where a loop is the best/easiest solution.
However, it does a join operation in the loop, so there will be some optimisations that can be made
The idea is to loop over each schedule_id, and keep track of which food_trucks have already been used on which date.
If we do some pre-arranging of the data before the loop it makes things easier
# Sort so that, within each schedule, the truck with the largest dsle is first.
df <- arrange(df, schedule_id, desc(dsle))

## Pre-allocate the result and the bookkeeping table, one row per schedule.
ids <- unique(df$schedule_id)
df_res <- data.frame(schedule_id = ids, food_truck_id = NA)
usedTrucks <- data.frame(
  date = as.Date(NA),
  schedule_id = ids,
  food_truck_id = NA_integer_
)

for (k in seq_along(ids)) {
  ## Candidate trucks for this schedule, already ordered by descending dsle.
  candidates <- df[df$schedule_id %in% ids[k], c("date", "food_truck_id")]
  ## Remove trucks that an earlier schedule already took on the same date.
  candidates <- anti_join(candidates, usedTrucks, by = c("date", "food_truck_id"))
  ## The first remaining candidate is the best one still available.
  pick <- candidates[1, c("food_truck_id", "date")]
  df_res[k, "food_truck_id"] <- pick$food_truck_id
  usedTrucks[k, "food_truck_id"] <- pick$food_truck_id
  usedTrucks[k, "date"] <- pick$date
}
df_res
# schedule_id food_truck_id
# 1 399 58
# 2 422 135
If speed is an issue on a larger data set this can be re-written in Rcpp to make it much faster.
# First pick: the row with the largest dsle within the highest schedule_id.
p <- df %>%
  arrange(desc(schedule_id), desc(dsle)) %>%
  slice(1) %>%
  select(date, dsle, schedule_id, food_truck_id)

# Second pick: among the other schedule(s), the largest dsle whose truck differs
# from the first pick. The comparison is made against the specific fields of
# `p` (schedule_id, food_truck_id); matching against every value held in `p`
# would drop unrelated rows that merely share a number with one of its columns,
# and would miss the already-used truck whenever its dsle differs.
df %>%
  filter(
    schedule_id != p$schedule_id,
    food_truck_id != p$food_truck_id
  ) %>%
  select(date, dsle, schedule_id, food_truck_id) %>%
  arrange(desc(dsle)) %>%
  slice(1) %>%
  rbind(p, .) %>%
  select(-dsle)
output
# A tibble: 2 x 3
date schedule_id food_truck_id
<date> <int> <int>
1 2018-04-26 422 58
2 2018-04-26 399 135

Subset column and compute operations for each subset [duplicate]

This question already has answers here:
How to sum a variable by group
(18 answers)
Closed 5 years ago.
Here is a minimal example of dataframe to reproduce.
df <- structure(list(Gene = structure(c(147L, 147L, 148L, 148L, 148L,
87L, 87L, 87L, 87L, 87L), .Label = c("genome", "k141_1189_101",
"k141_1189_104", "k141_1189_105", "k141_1189_116", "k141_1189_13",
"k141_1189_14", "k141_1189_146", "k141_1189_150", "k141_1189_18",
"k141_1189_190", "k141_1189_194", "k141_1189_215", "k141_1189_248",
"k141_1189_251", "k141_1189_252", "k141_1189_259", "k141_1189_274",
"k141_1189_283", "k141_1189_308", "k141_1189_314", "k141_1189_322",
"k141_1189_353", "k141_1189_356", "k141_1189_372", "k141_1189_373",
"k141_1189_43", "k141_1189_45", "k141_1189_72", "k141_1597_15",
"k141_1597_18", "k141_1597_23", "k141_1597_41", "k141_1597_55",
"k141_1597_66", "k141_1597_67", "k141_1597_68", "k141_1597_69",
"k141_2409_34", "k141_2409_8", "k141_3390_69", "k141_3390_83",
"k141_3390_84", "k141_3726_25", "k141_3726_31", "k141_3726_49",
"k141_3726_50", "k141_3726_62", "k141_3726_8", "k141_3726_80",
"k141_3790_1", "k141_3993_114", "k141_3993_122", "k141_3993_162",
"k141_3993_172", "k141_3993_183", "k141_3993_186", "k141_3993_188",
"k141_3993_24", "k141_3993_25", "k141_3993_28", "k141_3993_32",
"k141_3993_44", "k141_3993_47", "k141_3993_53", "k141_3993_57",
"k141_3993_68", "k141_4255_80", "k141_4255_81", "k141_4255_87",
"k141_5079_107", "k141_5079_110", "k141_5079_130", "k141_5079_14",
"k141_5079_141", "k141_5079_16", "k141_5079_184", "k141_5079_185",
"k141_5079_202", "k141_5079_24", "k141_5079_39", "k141_5079_63",
"k141_5079_65", "k141_5079_70", "k141_5079_77", "k141_5079_87",
"k141_5079_9", "k141_5313_16", "k141_5313_17", "k141_5313_20",
"k141_5313_23", "k141_5313_39", "k141_5313_5", "k141_5313_51",
"k141_5313_52", "k141_5313_78", "k141_5545_101", "k141_5545_103",
"k141_5545_104", "k141_5545_105", "k141_5545_106", "k141_5545_107",
"k141_5545_108", "k141_5545_109", "k141_5545_110", "k141_5545_111",
"k141_5545_112", "k141_5545_113", "k141_5545_114", "k141_5545_119",
"k141_5545_128", "k141_5545_130", "k141_5545_139", "k141_5545_141",
"k141_5545_145", "k141_5545_16", "k141_5545_169", "k141_5545_17",
"k141_5545_172", "k141_5545_6", "k141_5545_60", "k141_5545_62",
"k141_5545_63", "k141_5545_86", "k141_5545_87", "k141_5545_88",
"k141_5545_89", "k141_5545_91", "k141_5545_92", "k141_5545_93",
"k141_5545_94", "k141_5545_96", "k141_5545_97", "k141_5545_98",
"k141_5545_99", "k141_5734_13", "k141_5734_2", "k141_5734_4",
"k141_5734_5", "k141_5734_6", "k141_6014_124", "k141_6014_2",
"k141_6014_34", "k141_6014_75", "k141_6014_96", "k141_908_14",
"k141_908_2", "k141_908_5", "k141_957_126", "k141_957_135", "k141_957_136",
"k141_957_14", "k141_957_140", "k141_957_141", "k141_957_148",
"k141_957_179", "k141_957_191", "k141_957_35", "k141_957_47",
"k141_957_55", "k141_957_57", "k141_957_59", "k141_957_6", "k141_957_63",
"k141_957_65", "k141_957_68", "k141_957_77", "k141_957_95"), class = "factor"),
depth = c(9L, 10L, 9L, 10L, 11L, 14L, 15L, 16L, 17L, 18L),
bases_covered = c(6L, 3L, 4L, 7L, 4L, 59L, 54L, 70L, 34L,
17L), gene_length = c(1140L, 1140L, 591L, 591L, 591L, 690L,
690L, 690L, 690L, 690L), regioncoverage = c(54L, 30L, 36L,
70L, 44L, 826L, 810L, 1120L, 578L, 306L)), .Names = c("Gene",
"depth", "bases_covered", "gene_length", "regioncoverage"), row.names = c(1L,
2L, 33L, 34L, 35L, 78L, 79L, 80L, 81L, 82L), class = "data.frame")
The dataframe looks like this:
Gene depth bases_covered gene_length regioncoverage
1 k141_908_2 9 6 1140 54
2 k141_908_2 10 3 1140 30
33 k141_908_5 9 4 591 36
34 k141_908_5 10 7 591 70
35 k141_908_5 11 4 591 44
78 k141_5079_9 14 59 690 826
79 k141_5079_9 15 54 690 810
80 k141_5079_9 16 70 690 1120
81 k141_5079_9 17 34 690 578
82 k141_5079_9 18 17 690 306
What I want is, for each Gene (e.g. k141_908_2), to sum the region coverage and divide it by unique(gene length). In fact, the gene length is always the same value within a gene.
For example for Gene K141_908_2 i would do: (54+30)/1140 = 0.07
For example for Gene K141_908_5 i would do: (36+70+44)/591 = 0.25
The final dataframe should report two columns.
Gene Newcoverage
1 k141_908_2 0.07
2 k141_908_5 0.25
3 ......
and so on .
Thanks for your help
This is straightforward with dplyr:
library(dplyr)
# Per gene: total region coverage divided by the gene's length (taken from the
# first row of the group).
per_gene <- group_by(df, Gene)
df_final <- summarize(
  per_gene,
  Newcoverage = sum(regioncoverage) / first(gene_length)
)
df_final
# # A tibble: 3 × 2
# Gene Newcoverage
# <fctr> <dbl>
# 1 k141_5079_9 5.27536232
# 2 k141_908_2 0.07368421
# 3 k141_908_5 0.25380711
I needed to set the first column to character and others to numeric. But after that you can just split the df by gene and then do the necessary calculations.
# Make the measurement columns numeric and Gene a plain character vector;
# splitting on a character key yields groups only for genes present in the data.
df[, 2:5] <- lapply(df[, 2:5], as.numeric)
df$Gene <- as.character(df$Gene)
# One value per gene: column 5 (regioncoverage) divided by the gene's first
# column-4 value (gene_length), summed. vapply() guarantees a numeric vector
# with one element per gene, whatever the input looks like.
vapply(
  split(df, df$Gene),
  function(g) sum(g[, 5] / g[1, 4]),
  numeric(1)
)
#k141_5079_9 k141_908_2 k141_908_5
# 5.27536232 0.07368421 0.25380711
We can use tidyverse
library(tidyverse)
# Collapse to one row per gene: summed coverage over the group's first length.
summarise(
  group_by(df, Gene),
  Newcoverage = sum(regioncoverage) / gene_length[1]
)
# A tibble: 3 × 2
# Gene Newcoverage
# <fctr> <dbl>
#1 k141_5079_9 5.27536232
#2 k141_908_2 0.07368421
#3 k141_908_5 0.25380711
Or a base R option is
# Columns 4:5 are gene_length and regioncoverage; within each gene, divide the
# summed second column by the first row's first column.
by(
  df[4:5],
  list(as.character(df[, "Gene"])),
  FUN = function(part) {
    total_cov <- sum(part[, 2])
    total_cov / part[1, 1]
  }
)
quick approach is
# library() stops with an error if data.table is missing; require() would only
# warn and let the script fail later with a less obvious message.
library(data.table)
# setDT() converts df to a data.table by reference.
DT <- setDT(df)
# Per gene: summed coverage over gene length; unique() collapses the repeated
# per-row results into a single row per distinct value.
DT[, .(New_Coverage = unique(sum(regioncoverage) / gene_length)), by = .(Gene)]
output
Gene New_Coverage
1: k141_908_2 0.07368421
2: k141_908_5 0.25380711
3: k141_5079_9 5.27536232
I use dplyr a lot. So here's one way:
library(dplyr)
# Keep every row and attach the per-gene coverage ratio as a new column.
mutate(
  group_by(df, Gene),
  Newcoverage = sum(regioncoverage) / unique(gene_length)
)
If you want only unique values per Gene:
# Keep only Gene and the per-gene ratio, then collapse the repeated rows.
unique(
  transmute(
    group_by(df, Gene),
    Newcoverage = sum(regioncoverage) / unique(gene_length)
  )
)

Calculate stats in concatenated strings in R

Suppose I have a dataframe like this:
X. Name Type Total HP Attack Defense Sp..Atk Sp..Def Speed
795 718 Zygarde50% Forme Dragon/Ground 600 108 100 121 81 95 95
796 719 Diancie Rock/Fairy 600 50 100 150 100 150 50
797 719 DiancieMega Diancie Rock/Fairy 700 50 160 110 160 110 110
798 720 HoopaHoopa Confined Psychic/Ghost 600 80 110 60 150 130 70
799 720 HoopaHoopa Unbound Psychic/Dark 680 80 160 60 170 130 80
800 721 Volcanion Fire/Water 600 80 110 120 130 90 70
If I want to calculate the average stats (Total, HP, Attack, Defense, etc...), per type Dragon, type Ground, type Rock, type Fairy, etc... (instead of type Dragon/Ground, Rock/Fairy), how would I proceed? The stats of pokemons that belong to any two types would be used in calculating the average stats for both.
I have written the code using functions in the dplyr package:
# Row count and mean of every stat column for each group of `byType`
# (`byType` is created elsewhere; presumably the data grouped by Type).
summaryStats_byType <- summarise(
  byType,
  count = n(),
  averageTotal = mean(Total, na.rm = TRUE),
  averageHP = mean(HP, na.rm = TRUE),
  # Attack is one of the stats the question asks for but was absent here.
  averageAttack = mean(Attack, na.rm = TRUE),
  averageDefense = mean(Defense, na.rm = TRUE),
  averageSpAtk = mean(Sp..Atk, na.rm = TRUE),
  averageSpDef = mean(Sp..Def, na.rm = TRUE),
  averageSpeed = mean(Speed, na.rm = TRUE)
)
but obviously it counts "Dragon/Ground" as a type instead of two.
One way is to split the Type column in long format (I chose cSplit from splitstackshape to do this) and group_by as usual, i.e.
library(splitstackshape)
library(dplyr)
# One row per individual type: "Dragon/Ground" becomes a Dragon row and a
# Ground row, each carrying the same stats.
df1 <- cSplit(df, "Type", sep = "/", direction = "long")
# Mean of every stat column per type. across() replaces the deprecated
# summarise_each()/funs(); the id columns X. and Name are excluded.
df1 %>%
  group_by(Type) %>%
  summarise(across(-c(X., Name), mean))
# A tibble: 9 × 8
# Type Total HP Attack Defense Sp..Atk Sp..Def Speed
# <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Dark 680 80 160 60 170 130 80
#2 Dragon 600 108 100 121 81 95 95
#3 Fairy 650 50 130 130 130 130 80
#4 Fire 600 80 110 120 130 90 70
#5 Ghost 600 80 110 60 150 130 70
#6 Ground 600 108 100 121 81 95 95
#7 Psychic 640 80 135 60 160 130 75
#8 Rock 650 50 130 130 130 130 80
#9 Water 600 80 110 120 130 90 70
Alternatively (as noted by @DavidArenburg) we can also use separate_rows from tidyr as part of the pipe, i.e.
library(tidyr)
library(dplyr)
df %>%
  # Split "A/B" types into one row per type; the separator is given explicitly
  # so that only "/" splits a value.
  separate_rows(Type, sep = "/") %>%
  group_by(Type) %>%
  # across() replaces the deprecated summarise_each()/funs(); the id columns
  # X. and Name are excluded from the averages.
  summarise(across(-c(X., Name), mean))
which of course yields the same results
DATA
dput(df)
structure(list(X. = c(718L, 719L, 719L, 720L, 720L, 721L), Name = structure(c(6L,
1L, 2L, 3L, 4L, 5L), .Label = c("Diancie", "DiancieMega_Diancie",
"HoopaHoopa_Confined", "HoopaHoopa_Unbound", "Volcanion", "Zygarde50%_Forme"
), class = "factor"), Type = structure(c(1L, 5L, 5L, 4L, 3L,
2L), .Label = c("Dragon/Ground", "Fire/Water", "Psychic/Dark",
"Psychic/Ghost", "Rock/Fairy"), class = "factor"), Total = c(600L,
600L, 700L, 600L, 680L, 600L), HP = c(108L, 50L, 50L, 80L, 80L,
80L), Attack = c(100L, 100L, 160L, 110L, 160L, 110L), Defense = c(121L,
150L, 110L, 60L, 60L, 120L), Sp..Atk = c(81L, 100L, 160L, 150L,
170L, 130L), Sp..Def = c(95L, 150L, 110L, 130L, 130L, 90L), Speed = c(95L,
50L, 110L, 70L, 80L, 70L)), .Names = c("X.", "Name", "Type",
"Total", "HP", "Attack", "Defense", "Sp..Atk", "Sp..Def", "Speed"
), class = "data.frame", row.names = c("795", "796", "797", "798",
"799", "800"))

Resources