R, calculating last 3 average - r

I have the following data frame in R. It contains the statistics of each player in the Olympics basketball tournament
Sample Table
Each game is denoted by a number in the game column. I would like to create a new column with the average of the last 3 games. When following examples in similar posts, my biggest obstacle is that I have game numbers instead of the actual dates that the other methods seem to require.
Any assistance would be greatly appreciated.
Thanks
EDIT:
To clarify a little more based on some of the solutions and suggestions: for each row, I would like the new column to show the average minutes or points from the last 3 games up to and including that row's game. So far, the suggestions make each row show the average of games 3, 4, & 5.
So for example.
Player A, game = 3
Avg Pts = mean(pts game1, pts game2, pts game3)
Player B, game = 4
Avg pts = mean(pts game2 ,pts game3, pts game4)
I hope that clears it up.
Thanks
Data:
I am very new at this. I hope this is the appropriate method for sharing data.
structure(list(Player = structure(c(1L, 2L, 6L, 8L, 17L, 21L,
23L, 24L, 24L, 24L, 24L, 25L, 26L, 15L, 20L, 20L, 12L, 15L, 11L,
5L, 15L, 16L, 14L, 9L, 20L, 11L, 18L, 4L, 12L, 9L, 4L, 9L, 20L,
12L, 5L, 13L, 22L, 7L, 11L, 20L, 4L, 5L, 10L, 11L, 14L, 19L,
3L, 7L, 14L, 5L), .Label = c("Adas Juskevicius", "Alex Abrines",
"Andrew Bogut", "Bojan Bogdanovic", "Boris Diaw", "Brock Motum",
"Dario Saric", "Dwight Lewis", "Facundo Campazzo", "Ike Diogu",
"Jianlian Yi", "Jonas Maciulis", "Kevin Durant", "Luis Scola",
"Mantas Kalnietis", "Matt Dellavedova", "Miguel Marriaga", "Milos Teodosic",
"Nikola Mirotic", "Pau Gasol", "Rafa Luz", "Ricky Rubio", "Roberto Acuna",
"Vaidas Kariniauskas", "Windi Graterol", "Zeljko Sakic"), class = "factor"),
Team = structure(c(8L, 6L, 2L, 12L, 12L, 3L, 1L, 8L, 8L,
8L, 8L, 12L, 5L, 8L, 6L, 6L, 8L, 8L, 4L, 7L, 8L, 2L, 1L,
1L, 6L, 4L, 10L, 5L, 8L, 1L, 5L, 1L, 6L, 8L, 7L, 11L, 6L,
5L, 4L, 6L, 5L, 7L, 9L, 4L, 1L, 6L, 2L, 5L, 1L, 7L), .Label = c("ARG",
"AUS", "BRZ", "CHN", "CRO", "ESP", "FRA", "LTU", "NGR", "SRB",
"USA", "VEN"), class = "factor"), Pos = structure(c(3L, 4L,
2L, 5L, 2L, 5L, 1L, 2L, 2L, 2L, 2L, 1L, 4L, 3L, 1L, 1L, 4L,
5L, 2L, 2L, 5L, 3L, 2L, 3L, 1L, 4L, 5L, 2L, 2L, 3L, 2L, 3L,
1L, 2L, 2L, 4L, 3L, 4L, 4L, 1L, 2L, 2L, 2L, 4L, 1L, 2L, 1L,
4L, 1L, 2L), .Label = c("C", "PF", "PG", "SF", "SG"), class = "factor"),
game = c(4L, 5L, 4L, 5L, 3L, 4L, 3L, 1L, 2L, 3L, 4L, 5L,
5L, 3L, 2L, 3L, 3L, 4L, 3L, 3L, 2L, 4L, 3L, 3L, 5L, 5L, 5L,
4L, 2L, 2L, 2L, 5L, 4L, 4L, 2L, 2L, 1L, 4L, 4L, 1L, 5L, 4L,
3L, 2L, 4L, 2L, 2L, 3L, 2L, 1L), Status = c(0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), Drafted = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 85,
82, 80, 78, 77, 74, 68, 68, 68, 65, 64, 63, 62, 62, 61, 61,
60, 59, 59, 59, 58, 57, 57, 57, 56, 56, 56, 55, 55, 55, 55,
54, 54, 53, 53, 52, 51), Min = c(11.04, 1.44, 16.56, 2.88,
4.8, 1.92, 13.68, 3.84, 9.36, 2.64, 21.12, 17.04, 0.24, 36.48,
32.16, 23.28, 26.88, 17.28, 33.6, 28.56, 30.48, 19.92, 30.24,
25.92, 27.84, 34.8, 15.12, 36, 28.8, 29.04, 29.28, 21.36,
23.04, 18.72, 21.12, 25.2, 12.24, 27.12, 32.88, 31.92, 34.08,
18.24, 27.6, 32.64, 33.6, 32.88, 24.72, 34.8, 35.76, 31.44
), FIC = c(3.8, 1.5, 10.2, 1, 0, -1, 0.2, 0.5, -3.2, -1,
0.6, 4.5, -0.5, 15.6, 9.5, 11.1, 0.5, 7.8, 17, 16.8, 25.2,
10.5, 10, 6, 14.4, 6, 7.5, 15.5, 14.8, 6.2, 7.9, 3, 26.9,
0.8, 11.4, 16, -1, 4.9, 14.1, 18.5, 5.9, 6.5, 10, 10, 10,
8, 19, 9, 12.1, 7.5), FP = c(8, 4, 21.75, 2, 2.75, -0.5,
4.75, 1.5, 2.5, 1.25, 8.5, 13, 0, 35.25, 37, 32.25, 17, 18.5,
39.5, 34.25, 49, 19.25, 28.75, 20.25, 41.25, 27.5, 16.5,
39.25, 33.5, 29, 30.75, 13.25, 47.25, 9, 24.5, 28.5, 6.25,
19.5, 38.25, 40.25, 27.5, 17, 21.75, 37.5, 29, 21, 38.5,
30.75, 37.75, 25.75), FPM = c(0.72463768115942, 2.77777777777778,
1.31340579710145, 0.694444444444444, 0.572916666666667, -0.260416666666667,
0.347222222222222, 0.390625, 0.267094017094017, 0.473484848484848,
0.402462121212121, 0.762910798122066, 0, 0.966282894736842,
1.15049751243781, 1.38530927835052, 0.632440476190476, 1.07060185185185,
1.17559523809524, 1.19922969187675, 1.60761154855643, 0.96636546184739,
0.950727513227513, 0.78125, 1.48168103448276, 0.790229885057471,
1.09126984126984, 1.09027777777778, 1.16319444444444, 0.99862258953168,
1.05020491803279, 0.620318352059925, 2.05078125, 0.480769230769231,
1.16003787878788, 1.13095238095238, 0.51062091503268, 0.719026548672566,
1.16332116788321, 1.2609649122807, 0.806924882629108, 0.932017543859649,
0.78804347826087, 1.14889705882353, 0.863095238095238, 0.638686131386861,
1.55744336569579, 0.883620689655172, 1.05564876957494, 0.819020356234097
), PTS = c(5L, 2L, 15L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 6L, 9L,
0L, 17L, 13L, 16L, 10L, 16L, 18L, 11L, 21L, 6L, 12L, 10L,
19L, 20L, 7L, 28L, 21L, 10L, 18L, 10L, 23L, 4L, 7L, 16L,
0L, 7L, 20L, 26L, 22L, 10L, 7L, 19L, 14L, 6L, 9L, 15L, 23L,
9L), TPM = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 2L, 0L, 0L, 2L, 2L, 0L, 0L, 1L, 0L, 1L, 2L, 1L, 0L,
1L, 3L, 3L, 1L, 1L, 1L, 5L, 0L, 1L, 2L, 0L, 1L, 2L, 3L, 4L,
0L, 0L, 3L, 2L, 0L, 1L, 3L, 3L, 1L), Ast = c(2L, 0L, 2L,
0L, 1L, 0L, 0L, 1L, 2L, 0L, 1L, 1L, 0L, 7L, 1L, 3L, 1L, 2L,
1L, 9L, 12L, 8L, 4L, 1L, 1L, 2L, 5L, 2L, 2L, 8L, 2L, 1L,
5L, 2L, 5L, 5L, 1L, 0L, 2L, 1L, 1L, 0L, 3L, 0L, 1L, 2L, 6L,
3L, 0L, 2L), Reb = c(0L, 0L, 3L, 0L, 1L, 0L, 1L, 0L, 0L,
1L, 2L, 2L, 0L, 5L, 10L, 7L, 6L, 0L, 10L, 9L, 4L, 1L, 7L,
3L, 13L, 2L, 0L, 3L, 4L, 4L, 7L, 1L, 5L, 0L, 4L, 2L, 1L,
6L, 9L, 9L, 2L, 4L, 7L, 6L, 10L, 8L, 12L, 7L, 9L, 5L), BLK = c(0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 4L, 2L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 2L, 2L, 0L, 0L, 1L, 0L, 1L,
0L, 2L, 0L, 3L, 1L, 0L, 1L, 2L, 0L, 0L, 0L, 1L, 1L, 0L, 1L,
3L, 1L, 1L, 2L), STL = c(0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 3L, 0L, 1L, 0L, 1L,
2L, 0L, 0L, 1L, 2L, 1L, 2L, 0L, 0L, 2L, 1L, 0L, 0L, 2L, 2L,
0L, 0L, 1L, 1L, 0L, 4L, 0L, 0L, 0L, 1L, 1L, 2L), TO = c(1L,
0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 3L, 1L, 0L, 3L, 3L, 2L,
6L, 3L, 0L, 3L, 2L, 0L, 1L, 0L, 3L, 4L, 1L, 2L, 2L, 5L, 3L,
0L, 0L, 0L, 3L, 1L, 1L, 3L, 2L, 0L, 5L, 0L, 1L, 1L, 3L, 0L,
2L, 6L, 4L, 2L)), .Names = c("Player", "Team", "Pos", "game",
"Status", "Drafted", "Min", "FIC", "FP", "FPM", "PTS", "TPM",
"Ast", "Reb", "BLK", "STL", "TO"), row.names = c(NA, 50L), class = "data.frame")

Using dplyr with mtcars example data:
library(dplyr)
mtcars %>%
group_by(cyl) %>%
mutate(last3mean = mean(tail(mpg, 3)))
In your case, instead of cyl and mpg, use Player and the column to aggregate.
Using data.table (suggested by @akrun):
# Per-cylinder mean of the last three mpg values, one row per cyl group.
library(data.table)
as.data.table(mtcars)[, .(last3mean = mean(tail(mpg, 3))), by = cyl]

You can use rollmeanr from the zoo package with dplyr. This computes, for each player, a moving average over the last three games at every row, rather than a single average of that player's final three games. The code is as follows:
library(dplyr)
library(zoo)
## Right-aligned moving average of `x` over a window of `k` values (default 3).
## Returns a vector of the same length as `x`, so it can be used in mutate();
## positions without a full window of `k` values are NA.
avg.last.3 <- function (x, k = 3) { ## 1.
  ## rollmeanr() needs at least `k` values, so shorter input is answered
  ## directly. NA_real_ (not logical NA) keeps the result type double for
  ## every group, matching what rollmeanr() returns for longer input.
  if (length(x) < k) rep(NA_real_, length(x)) else rollmeanr(x, k, fill = NA)
}
res <- df %>% group_by(Player) %>% arrange(game) %>% ## 2.
mutate(Avg.Pts=avg.last.3(PTS)) %>% ## 3.
ungroup() %>% arrange(Player,game) ## 4.
Notes:
1. Define a function avg.last.3 that applies the function rollmeanr with a window length of 3. rollmeanr specifies align="right" to average the last three games, and we pad any result that does not have three games to average with NA. Note that the if condition in this function is needed so that:
   - the length of x is at least the window length, as required by rollmeanr;
   - avg.last.3 returns a vector that is the same length as its input, as required by mutate.
2. First group_by the Player. Since the game column is not necessarily sorted for each Player, we sort by game in ascending order.
3. Use mutate to create a new column Avg.Pts resulting from applying the avg.last.3 function to a column, for example PTS.
4. Finally, ungroup and present the result sorted by Player followed by game.
Of course, you can get the average of any number of columns by:
mutate(Avg.Pts=avg.last.3(PTS), Avg.Min=avg.last.3(Min), Avg.Ast=avg.last.3(Ast), ...)
The results averaging only the PTS column is given by (printing only the first six columns plus PTS and Avg.Pts):
print(res[,c(colnames(res)[1:6],"PTS","Avg.Pts")],n=50)
### A tibble: 50 x 8
## Player Team Pos game Status Drafted PTS Avg.Pts
## <fctr> <fctr> <fctr> <int> <int> <dbl> <int> <dbl>
##1 Adas Juskevicius LTU PG 4 0 0 5 NA
##2 Alex Abrines ESP SF 5 0 0 2 NA
##3 Andrew Bogut AUS C 2 1 53 9 NA
##4 Bojan Bogdanovic CRO PF 2 1 59 18 NA
##5 Bojan Bogdanovic CRO PF 4 1 61 28 NA
##6 Bojan Bogdanovic CRO PF 5 1 55 22 22.666667
##7 Boris Diaw FRA PF 1 1 51 9 NA
##8 Boris Diaw FRA PF 2 1 57 7 NA
##9 Boris Diaw FRA PF 3 1 68 11 9.000000
##10 Boris Diaw FRA PF 4 1 55 10 9.333333
##11 Brock Motum AUS PF 4 0 0 15 NA
##12 Dario Saric CRO SF 3 1 53 15 NA
##13 Dario Saric CRO SF 4 1 56 7 NA
##14 Dwight Lewis VEN SG 5 0 0 0 NA
##15 Facundo Campazzo ARG PG 2 1 60 10 NA
##16 Facundo Campazzo ARG PG 3 1 64 10 NA
##17 Facundo Campazzo ARG PG 5 0 59 10 10.000000
##18 Ike Diogu NGR PF 3 1 55 7 NA
##19 Jianlian Yi CHN SF 2 1 55 19 NA
##20 Jianlian Yi CHN PF 3 1 74 18 NA
##21 Jianlian Yi CHN SF 4 1 56 20 19.000000
##22 Jianlian Yi CHN SF 5 1 62 20 19.333333
##23 Jonas Maciulis LTU PF 2 1 61 21 NA
##24 Jonas Maciulis LTU SF 3 1 78 10 NA
##25 Jonas Maciulis LTU PF 4 1 58 4 11.666667
##26 Kevin Durant USA SF 2 1 57 16 NA
##27 Luis Scola ARG C 2 1 52 23 NA
##28 Luis Scola ARG PF 3 1 65 12 NA
##29 Luis Scola ARG C 4 1 54 14 16.333333
##30 Mantas Kalnietis LTU SG 2 1 68 21 NA
##31 Mantas Kalnietis LTU PG 3 1 85 17 NA
##32 Mantas Kalnietis LTU SG 4 1 77 16 18.000000
##33 Matt Dellavedova AUS PG 4 1 68 6 NA
##34 Miguel Marriaga VEN PF 3 0 0 0 NA
##35 Milos Teodosic SRB SG 5 0 62 7 NA
##36 Nikola Mirotic ESP PF 2 1 54 6 NA
##37 Pau Gasol ESP C 1 1 56 26 NA
##38 Pau Gasol ESP C 2 1 82 13 NA
##39 Pau Gasol ESP C 3 1 80 16 18.333333
##40 Pau Gasol ESP C 4 1 59 23 17.333333
##41 Pau Gasol ESP C 5 1 63 19 19.333333
##42 Rafa Luz BRZ SG 4 0 0 0 NA
##43 Ricky Rubio ESP PG 1 1 57 0 NA
##44 Roberto Acuna ARG C 3 1 0 2 NA
##45 Vaidas Kariniauskas LTU PF 1 0 0 0 NA
##46 Vaidas Kariniauskas LTU PF 2 0 0 0 NA
##47 Vaidas Kariniauskas LTU PF 3 0 0 0 0.000000
##48 Vaidas Kariniauskas LTU PF 4 0 0 6 2.000000
##49 Windi Graterol VEN C 5 0 0 9 NA
##50 Zeljko Sakic CRO SF 5 0 0 0 NA

First split the data frame up by player
playerDFs <- split(origdata, origdata["Player"])
Then subset the last 3 games
playerLast3 <- lapply(playerDFs, function(x) x[tail(order(x[["game"]]),3), ])
Finally get your means
# colMeans() only accepts numeric input, so average just the numeric columns;
# the factor columns (Player, Team, Pos) would otherwise raise an error.
# The result is a matrix with one column per player and one row per statistic.
numericCols <- vapply(origdata, is.numeric, logical(1))
vapply(playerLast3,
       function(x) colMeans(x[, numericCols, drop = FALSE]),
       numeric(sum(numericCols)))

Related

Creating a new dataframe with averages from another dataframe with multiple conditions in R

I have fish count data and am trying to create a new dataframe using averages of the measurements based on conditions of two different columns. here is my data:
df <- structure(list(SITE = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L,
2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L), .Label = c("1", "2", "3"), class = "factor"),
ZONE = structure(c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("CREST", "INNER_FLAT", "MID_FLAT",
"OUTER_FLAT", "SLOPE"), class = "factor"), C_TOTAL = c(44L,
7L, 20L, 14L, 0L, 4L, 2L, 3L, 1L, 8L, 28L, 24L, 31L, 12L,
33L, 6L, 16L, 33L, 75L, 21L, 60L, 81L, 37L, 89L, 21L, 35L,
71L, 5L, 2L, 0L, 0L, 10L, 23L, 0L, 5L, 11L, 3L, 1L, 5L, 0L,
0L, 8L, 7L, 6L, 42L), C_M2 = c(0.210465706, 0.029861994,
0.090324177, 0.066599319, 0, 0.022092452, 0.011750593, 0.015245519,
0.004710433, 0.033111594, 0.155094195, 0.110576495, 0.193659068,
0.059152822, 0.192379108, 0.047800772, 0.08917095, 0.141336411,
0.402538785, 0.130438337, 0.315206235, 0.460746849, 0.278643938,
0.467754275, 0.192830321, 0.119928472, 0.411502497, 0.015370489,
0.005150184, 0, 0, 0.034651441, 0.067824733, 0, 0.009805851,
0.034844309, 0.010614352, 0.004131048, 0.01850898, 0, 0,
0.029195413, 0.021409016, 0.030498145, 0.172406074), TRANS_A = c(209.0601875,
234.411677, 221.4246571, 210.2123593, 226.6158348, 181.0573136,
170.2041767, 196.7791332, 212.294701, 241.6072127, 180.5354478,
217.0443184, 160.0751279, 202.8643689, 171.536298, 125.5209863,
179.4306337, 233.485481, 186.3174499, 160.9956132, 190.3515643,
175.801528, 132.7859497, 190.2708425, 108.9040348, 291.8406241,
172.5384427, 325.2986863, 388.3356059, 303.1957479, 261.1574528,
288.5882879, 339.1093313, 239.1118021, 509.89965, 315.6899993,
282.6362022, 242.0693453, 270.1391425, 294.8864591, 321.2013381,
274.0156514, 326.9650539, 196.7332763, 243.6109069), SCARID_T = c(35L,
4L, 4L, 13L, 0L, 4L, 2L, 0L, 1L, 4L, 20L, 12L, 17L, 5L, 20L,
6L, 6L, 18L, 63L, 11L, 41L, 75L, 34L, 89L, 14L, 33L, 68L,
0L, 0L, 0L, 0L, 10L, 22L, 0L, 0L, 10L, 0L, 0L, 1L, 0L, 0L,
6L, 0L, 4L, 42L), ACAN_T = c(4L, 0L, 11L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 8L, 5L, 0L, 0L, 0L, 0L, 3L, 2L, 7L, 8L, 8L, 1L,
1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 5L, 0L, 0L,
0L, 1L, 0L, 0L, 2L, 0L, 0L, 0L), SIG_T = c(5L, 3L, 5L, 1L,
0L, 0L, 0L, 3L, 0L, 4L, 0L, 7L, 14L, 7L, 13L, 0L, 7L, 13L,
5L, 2L, 11L, 5L, 2L, 0L, 7L, 1L, 3L, 5L, 2L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 3L, 1L, 3L, 0L, 0L, 0L, 7L, 2L, 0L)), row.names = c(NA,
-45L), class = "data.frame")
I want to average all the measurements by each zone, but also according to site. So I want a new data frame where each site has one measurement for each zone.
Can anyone help me? Thanks!
library(dplyr)
df %>%
group_by(SITE, ZONE) %>%
summarise(
across(where(is.numeric), mean)
)
# A tibble: 15 x 8
# Groups: SITE [3]
SITE ZONE C_TOTAL C_M2 TRANS_A SCARID_T ACAN_T SIG_T
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 CREST 20 0.0996 213. 12 4.33 3.67
2 1 INNER_FLAT 3 0.0111 265. 0.333 0.333 2.33
3 1 MID_FLAT 2.33 0.00684 339. 0 0 2.33
4 1 OUTER_FLAT 52 0.283 179. 38.3 7.67 6
5 1 SLOPE 23.7 0.110 222. 14.3 5 4.33
6 2 CREST 25.3 0.148 178. 14 0 11.3
7 2 INNER_FLAT 2.67 0.00973 297. 2 0.667 0
8 2 MID_FLAT 11 0.0342 296. 10.7 0.333 0
9 2 OUTER_FLAT 69 0.402 166. 66 0.667 2.33
10 2 SLOPE 6 0.0296 206. 5.67 0 0.333
11 3 CREST 18.3 0.0928 179. 10 1.67 6.67
12 3 INNER_FLAT 18.3 0.0748 256. 15.3 0 3
13 3 MID_FLAT 5.33 0.0149 355. 3.33 1.67 0.333
14 3 OUTER_FLAT 42.3 0.241 191. 38.3 0.333 3.67
15 3 SLOPE 2 0.0106 193. 1 0 1

Loop over specific columns data and add the result as a new column in R

I have a dataframe df with following information:
df <- structure(list(Samples = structure(c(1L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 2L, 1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 2L, 1L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 2L, 1L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 2L), .Label = c("Sample1", "Sample10", "Sample2",
"Sample3", "Sample4", "Sample5", "Sample6", "Sample7", "Sample8",
"Sample9"), class = "factor"), patient.vital_status = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 1L), years = c(3.909589041, 1.457534247,
2.336986301, 5.010958904, 1.665753425, 1.81369863, 1.191780822,
4.687671233, 2.167123288, 1.95890411, 3.909589041, 1.457534247,
2.336986301, 5.010958904, 1.665753425, 1.81369863, 1.191780822,
4.687671233, 2.167123288, 1.95890411, 3.909589041, 1.457534247,
2.336986301, 5.010958904, 1.665753425, 1.81369863, 1.191780822,
4.687671233, 2.167123288, 1.95890411, 3.909589041, 1.457534247,
2.336986301, 5.010958904, 1.665753425, 1.81369863, 1.191780822,
4.687671233, 2.167123288, 1.95890411), Genes = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("A1BG", "A1CF", "A2M",
"A2ML1"), class = "factor"), value = c(0.034459012, 0.017698878,
0.023313851, 0.010456762, 0.032674019, 0.037561831, 0.03380681,
0, 0.019954956, 0.012392427, 0.835801613, 2.265192447, 2.431409095,
5.012117956, 2.139962802, 2.371946704, 4.555234385, 0.550293401,
0.924012327, 2.274642129, 92.85639578, 79.50897642, 23.72187602,
26.86025304, 32.80504253, 222.6449054, 71.78812505, 45.76371588,
29.93976676, 22.97515484, 0.03780441, 0.005825143, 0, 0.002867985,
0.011948708, 0.02060423, 0.004636111, 0.015903347, 0.005473063,
0.033988816)), class = "data.frame", row.names = c(NA, -40L))
I want to loop over the data gene by gene, using the Genes and value columns, and compute a result for each gene. I then want that result added to the dataframe df as a new column whose values are either low or high.
I'm trying to do this with the following code, but it doesn't work:
genes <- as.character(unique(df$Genes))
library(survival)
library(survminer)
for(i in genes){
surv_rnaseq.cut <- surv_cutpoint(
df,
time = "years",
event = "patient.vital_status",
variables = c("Genes","value"))
df$cat <- surv_categorize(surv_rnaseq.cut)
}
Along with the above result, I also want the summary of surv_rnaseq.cut for all four genes, each labelled with its gene name.
Any help please. thanq
An option would be to split by 'genes' (group_split), loop over the list, apply the functions and bind the list elements after creating the column
library(survminer)
library(survival)
library(dplyr)
library(purrr)
df %>%
group_split(Genes) %>%
map_dfr(~ surv_cutpoint(.x,
time = "years",
event = "patient.vital_status",
variables = c("Genes", "value")) %>%
surv_categorize %>%
pull(value) %>%
mutate(.x, cat = .))
# A tibble: 40 x 6
# Samples patient.vital_status years Genes value cat
# <fct> <int> <dbl> <fct> <dbl> <chr>
# 1 Sample1 0 3.91 A1BG 0.0345 high
# 2 Sample2 0 1.46 A1BG 0.0177 high
# 3 Sample3 0 2.34 A1BG 0.0233 high
# 4 Sample4 0 5.01 A1BG 0.0105 high
# 5 Sample5 0 1.67 A1BG 0.0327 high
# 6 Sample6 0 1.81 A1BG 0.0376 high
# 7 Sample7 0 1.19 A1BG 0.0338 high
# 8 Sample8 1 4.69 A1BG 0 low
# 9 Sample9 0 2.17 A1BG 0.0200 high
#10 Sample10 1 1.96 A1BG 0.0124 high
# … with 30 more rows

How can I identify specific string row number

My data looks like this
df<- structure(list(Main = structure(c(5L, 3L, 1L, 2L, 4L, 4L, 2L,
1L, 5L, 2L, 5L, 4L, 5L, 2L), .Label = c("IsMainbody", "IsMainbodyCandidate",
"IsMainbodyRejected", "Main", "None"), class = "factor"), Group.IDs = c(52L,
NA, 2L, 12L, 38L, 38L, 6L, 3L, NA, 49L, 20L, 38L, 54L, 85L),
X..Number1 = c(12L, 6L, 1L, 5L, 1L, 1L, 1L, 1L, 17L, 1L,
4L, 1L, 1L, 4L), X..No = c(20L, 62L, 2L, 16L, 3L, 3L, 1L,
3L, 32L, 3L, 36L, 3L, 1L, 20L), X..Unique.N = c(0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-14L))
I am trying to find the row numbers for specific strings.
Based on the Main column, I want to find how many of my samples have "Main", how many have "IsMainbodyCandidate", and how many have "IsMainbodyRejected".
Then I want to make a new dataset that consists only of the Main, IsMainbody and IsMainbodyCandidate rows, like below.
Main Group IDs # Number1 # No # Unique N
IsMainbody 2 1 2 1
IsMainbodyCandidate 12 5 16 0
Main 38 1 3 0
Main 38 1 3 0
IsMainbodyCandidate 6 1 1 0
IsMainbody 3 1 3 0
IsMainbodyCandidate 49 1 3 0
IsMainbodyCandidate 85 4 20 0
# count by main
table(df$Main)
# new dataframe without "None"
df[df$Main != "None", ]
# or more explicitly
df[df$Main %in% c("Main", "IsMainbody", "IsMainbodyCandidate"), ]

svm predict function error in R

I'm trying to run an svm on the titanic data set but I've run into an issue with the predict function.
svm.iceberg <- svm(Survived ~ Pclass+Sex+SibSp+Parch+Ticket+Fare+Cabin+Embarked+Surname+Age_Range, data = train.iceberg)
svm.prediction <- predict(svm.iceberg, newdata = test.iceberg) #prediction
I'm receiving the following error:
Error in names(ret2) <- rowns :'names' attribute [418] must be the same length as the vector [0]
I think the error is a result of when I made the factor levels the same for the train and test data for a few variables (here is the code I used):
test.iceberg$Name <- factor(test.iceberg$Name,levels =levels(train.iceberg$Name))
test.iceberg$Ticket <- factor(test.iceberg$Ticket,levels =levels(train.iceberg$Ticket))
test.iceberg$Cabin <- factor(test.iceberg$Cabin,levels =levels(train.iceberg$Cabin))
test.iceberg$Surname <- factor(test.iceberg$Ticket,levels =levels(train.iceberg$Surname))
Here is the train data frame:
str(train.iceberg)
'data.frame': 891 obs. of 14 variables:
$ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
$ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
$ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
$ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 354 273 16 555 516 625 413 577 ...
$ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
$ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
$ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
$ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
$ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
$ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
$ Cabin : Factor w/ 147 levels "A10","A14","A16",..: NA 82 NA 56 NA NA 130 NA NA NA ...
$ Embarked : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
$ Surname : Factor w/ 667 levels "Abbing","Abbott",..: 74 137 252 199 12 410 379 464 293 427 ...
$ Age_Range : num 4 4 4 4 4 NA 4 2 4 3 ...
This caused a lot of NA's in the test data frame as can be seen below:
> str(test.iceberg) #
'data.frame': 418 obs. of 14 variables:
$ PassengerId: int 892 893 894 895 896 897 898 899 900 901 ...
$ Survived : int NA NA NA NA NA NA NA NA NA NA ...
$ Pclass : Factor w/ 3 levels "1","2","3": 3 3 2 3 3 3 3 2 3 3 ...
$ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 434 NA NA NA NA NA 178 NA NA NA ...
$ Sex : Factor w/ 2 levels "female","male": 2 1 2 2 1 2 1 2 1 2 ...
$ Age : num 34 47 62 27 22 14 30 26 18 21 ...
$ SibSp : int 0 1 0 0 1 0 0 1 0 2 ...
$ Parch : int 0 0 0 0 1 0 0 1 0 0 ...
$ Ticket : Factor w/ 681 levels "110152","110413",..: NA NA NA NA 252 NA NA 159 NA 520 ...
$ Fare : num 7.83 7 9.69 8.66 12.29 ...
$ Cabin : Factor w/ 147 levels "A10","A14","A16",..: NA NA NA NA NA NA NA NA NA NA ...
$ Embarked : Factor w/ 3 levels "C","Q","S": 2 3 2 3 3 3 2 3 1 3 ...
$ Surname : Factor w/ 667 levels "Abbing","Abbott",..: NA NA NA NA NA NA NA NA NA NA ...
$ Age_Range : num 5 6 8 5 5 3 5 5 3 5 ...
Here is a sample of the data:
dput((droplevels(head(train.iceberg,100))))
structure(list(PassengerId = 1:100, Survived = c(0L, 1L, 1L,
1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L,
1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L,
0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L,
0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
0L), Pclass = structure(c(3L, 1L, 3L, 1L, 3L, 3L, 1L, 3L, 3L,
2L, 3L, 1L, 3L, 3L, 3L, 2L, 3L, 2L, 3L, 3L, 2L, 2L, 3L, 1L, 3L,
3L, 3L, 1L, 3L, 3L, 1L, 1L, 3L, 2L, 1L, 1L, 3L, 3L, 3L, 3L, 3L,
2L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 2L, 1L, 1L, 2L,
3L, 2L, 3L, 3L, 1L, 1L, 3L, 1L, 3L, 2L, 3L, 3L, 3L, 2L, 3L, 2L,
3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 1L, 2L, 3L, 3L, 3L, 1L,
3L, 3L, 3L, 1L, 3L, 3L, 3L, 1L, 1L, 2L, 2L), .Label = c("1",
"2", "3"), class = "factor"), Name = structure(c(12L, 22L, 41L,
32L, 2L, 61L, 56L, 72L, 48L, 64L, 79L, 11L, 80L, 4L, 95L, 42L,
75L, 99L, 94L, 55L, 33L, 9L, 58L, 86L, 73L, 7L, 27L, 31L, 70L,
90L, 92L, 87L, 34L, 98L, 59L, 43L, 54L, 14L, 93L, 65L, 1L, 91L,
51L, 52L, 24L, 76L, 53L, 69L, 78L, 6L, 74L, 66L, 39L, 28L, 71L,
100L, 77L, 67L, 97L, 36L, 83L, 45L, 40L, 84L, 89L, 62L, 68L,
21L, 3L, 50L, 47L, 37L, 44L, 19L, 10L, 60L, 88L, 63L, 13L, 26L,
96L, 81L, 57L, 15L, 46L, 8L, 29L, 85L, 30L, 16L, 18L, 5L, 17L,
23L, 20L, 82L, 35L, 38L, 25L, 49L), .Label = c("Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",
"Allen, Mr. William Henry", "Andersson, Miss. Erna Alexandra",
"Andersson, Mr. Anders Johan", "Andreasson, Mr. Paul Edvin",
"Arnold-Franchi, Mrs. Josef (Josefine Franchi)", "Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",
"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)", "Beesley, Mr. Lawrence",
"Bing, Mr. Lee", "Bonnell, Miss. Elizabeth", "Braund, Mr. Owen Harris",
"Caldwell, Master. Alden Gates", "Cann, Mr. Ernest Charles",
"Carrau, Mr. Francisco M", "Celotti, Mr. Francesco", "Chaffee, Mr. Herbert Fuller",
"Christmann, Mr. Emil", "Chronopoulos, Mr. Apostolos", "Coxon, Mr. Daniel",
"Crease, Mr. Ernest James", "Cumings, Mrs. John Bradley (Florence Briggs Thayer)",
"Dean, Mr. Bertram Frank", "Devaney, Miss. Margaret Delia", "Doling, Mrs. John T (Ada Julia Bone)",
"Dowdell, Miss. Elizabeth", "Emir, Mr. Farred Chehab", "Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",
"Ford, Mr. William Neal", "Fortune, Miss. Mabel Helen", "Fortune, Mr. Charles Alexander",
"Futrelle, Mrs. Jacques Heath (Lily May Peel)", "Fynney, Mr. Joseph J",
"Glynn, Miss. Mary Agatha", "Goldschmidt, Mr. George B", "Goodwin, Master. William Frederick",
"Goodwin, Miss. Lillian Amy", "Greenfield, Mr. William Bertram",
"Harper, Mrs. Henry Sleeper (Myna Haxtun)", "Harris, Mr. Henry Birkhardt",
"Heikkinen, Miss. Laina", "Hewlett, Mrs. (Mary D Kingcome) ",
"Holverson, Mr. Alexander Oskar", "Hood, Mr. Ambrose Jr", "Icard, Miss. Amelie",
"Ilett, Miss. Bertha", "Jenkin, Mr. Stephen Curnow", "Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",
"Kantor, Mr. Sinai", "Kink, Mr. Vincenz", "Kraeff, Mr. Theodor",
"Laroche, Miss. Simonne Marie Anne Andree", "Lennon, Mr. Denis",
"Mamee, Mr. Hanna", "Masselmani, Mrs. Fatima", "McCarthy, Mr. Timothy J",
"McDermott, Miss. Brigdet Delia", "McGowan, Miss. Anna \"Annie\"",
"Meyer, Mr. Edgar Joseph", "Moen, Mr. Sigurd Hansen", "Moran, Mr. James",
"Moubarek, Master. Gerios", "Moutal, Mr. Rahamin Haim", "Nasser, Mrs. Nicholas (Adele Achem)",
"Nicola-Yarred, Miss. Jamila", "Nosworthy, Mr. Richard Cater",
"Novel, Mr. Mansouer", "Nye, Mrs. (Elizabeth Ramell)", "O'Driscoll, Miss. Bridget",
"O'Dwyer, Miss. Ellen \"Nellie\"", "Ostby, Mr. Engelhart Cornelius",
"Palsson, Master. Gosta Leonard", "Palsson, Miss. Torborg Danira",
"Panula, Master. Juha Niilo", "Rice, Master. Eugene", "Rogers, Mr. William John",
"Rugg, Miss. Emily", "Samaan, Mr. Youssef", "Sandstrom, Miss. Marguerite Rut",
"Saundercock, Mr. William Henry", "Sheerlinck, Mr. Jan Baptist",
"Shorney, Mr. Charles Joseph", "Sirayanian, Mr. Orsen", "Skoog, Master. Harald",
"Slocovski, Mr. Selman Francis", "Sloper, Mr. William Thompson",
"Spencer, Mrs. William Augustus (Marie Eugenie)", "Staneff, Mr. Ivan",
"Stewart, Mr. Albert A", "Todoroff, Mr. Lalio", "Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",
"Uruchurtu, Don. Manuel E", "Vander Planke, Miss. Augusta Maria",
"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)", "Vestrom, Miss. Hulda Amanda Adolfina",
"Waelens, Mr. Achille", "West, Miss. Constance Mirium", "Wheadon, Mr. Edward H",
"Williams, Mr. Charles Eugene", "Woolner, Mr. Hugh"), class = "factor"),
Sex = structure(c(2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L
), .Label = c("female", "male"), class = "factor"), Age = c(22,
38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, 55,
2, NA, 31, NA, 35, 34, 15, 28, 8, 38, NA, 19, NA, NA, 40,
NA, NA, 66, 28, 42, NA, 21, 18, 14, 40, 27, NA, 3, 19, NA,
NA, NA, NA, 18, 7, 21, 49, 29, 65, NA, 21, 28, 5, 11, 22,
38, 45, 4, NA, NA, 29, 19, 17, 26, 32, 16, 21, 26, 32, 25,
NA, NA, 1, 30, 22, 29, NA, 28, 17, 33, 16, NA, 23, 24, 29,
20, 46, 26, 59, NA, 71, 23, 34, 34), SibSp = c(1L, 1L, 0L,
1L, 0L, 0L, 0L, 3L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 4L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 3L, 1L, 0L, 3L, 0L, 0L, 0L, 1L, 0L,
0L, 1L, 1L, 0L, 0L, 2L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
2L, 1L, 4L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 5L, 0L, 0L, 1L,
3L, 0L, 1L, 0L, 0L, 4L, 2L, 0L, 5L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 3L, 1L, 0L, 3L, 0L, 0L, 0L, 1L,
1L, 0L, 0L, 0L, 0L, 0L, 1L), Parch = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 2L, 0L, 1L, 0L, 0L, 5L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 5L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 2L, 2L, 0L, 0L, 0L, 2L, 0L,
1L, 0L, 0L, 2L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 3L, 0L, 2L, 0L, 0L, 0L, 0L, 2L, 0L,
0L, 0L, 1L, 1L, 0L), Ticket = structure(c(71L, 82L, 95L,
7L, 64L, 36L, 11L, 58L, 52L, 15L, 88L, 4L, 72L, 49L, 59L,
20L, 67L, 18L, 44L, 23L, 16L, 19L, 37L, 5L, 58L, 48L, 22L,
13L, 40L, 55L, 83L, 80L, 41L, 74L, 84L, 6L, 28L, 69L, 45L,
24L, 68L, 8L, 57L, 92L, 39L, 89L, 63L, 9L, 26L, 56L, 34L,
70L, 81L, 31L, 2L, 12L, 76L, 30L, 78L, 79L, 27L, 3L, 62L,
50L, 85L, 25L, 75L, 91L, 33L, 35L, 77L, 79L, 90L, 29L, 10L,
53L, 54L, 65L, 21L, 61L, 46L, 47L, 38L, 1L, 93L, 32L, 96L,
94L, 13L, 42L, 43L, 51L, 97L, 73L, 60L, 66L, 86L, 87L, 14L,
17L), .Label = c("113059", "113509", "113572", "113783",
"113788", "113789", "113803", "11668", "14311", "1601", "17463",
"19947", "19950", "231919", "237736", "239865", "244367",
"244373", "248698", "248706", "248738", "2631", "2649", "2651",
"2661", "2662", "2669", "2677", "2680", "2697", "2926", "3101278",
"3101281", "3101295", "315151", "330877", "330923", "330932",
"330958", "330959", "335677", "343275", "343276", "345763",
"345764", "345767", "345779", "347077", "347082", "347088",
"347466", "347742", "348123", "349208", "349216", "349237",
"349253", "349909", "350406", "364500", "364516", "36973",
"370371", "373450", "374746", "374910", "382652", "7546",
"A./5. 2152", "A/4. 39886", "A/5 21171", "A/5. 2151", "C.A. 2315",
"C.A. 24579", "C.A. 29395", "C.A. 31026", "C.A. 33111", "C.A. 34651",
"CA 2144", "PC 17569", "PC 17572", "PC 17599", "PC 17601",
"PC 17604", "PC 17605", "PC 17754", "PC 17759", "PP 9549",
"S.C./A.4. 23567", "S.O.C. 14879", "S.P. 3464", "SC/Paris 2123",
"SO/C 14885", "SOTON/OQ 392086", "STON/O2. 3101282", "W./C. 6608",
"W.E.P. 5734"), class = "factor"), Fare = c(7.25, 71.2833,
7.925, 53.1, 8.05, 8.4583, 51.8625, 21.075, 11.1333, 30.0708,
16.7, 26.55, 8.05, 31.275, 7.8542, 16, 29.125, 13, 18, 7.225,
26, 13, 8.0292, 35.5, 21.075, 31.3875, 7.225, 263, 7.8792,
7.8958, 27.7208, 146.5208, 7.75, 10.5, 82.1708, 52, 7.2292,
8.05, 18, 11.2417, 9.475, 21, 7.8958, 41.5792, 7.8792, 8.05,
15.5, 7.75, 21.6792, 17.8, 39.6875, 7.8, 76.7292, 26, 61.9792,
35.5, 10.5, 7.2292, 27.75, 46.9, 7.2292, 80, 83.475, 27.9,
27.7208, 15.2458, 10.5, 8.1583, 7.925, 8.6625, 10.5, 46.9,
73.5, 14.4542, 56.4958, 7.65, 7.8958, 8.05, 29, 12.475, 9,
9.5, 7.7875, 47.1, 10.5, 15.85, 34.375, 8.05, 263, 8.05,
8.05, 7.8542, 61.175, 20.575, 7.25, 8.05, 34.6542, 63.3583,
23, 26), Cabin = structure(c(NA, 11L, NA, 7L, NA, NA, 16L,
NA, NA, NA, 19L, 6L, NA, NA, NA, NA, NA, NA, NA, NA, NA,
14L, NA, 2L, NA, NA, NA, 8L, NA, NA, NA, 5L, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 13L, NA, 4L, 9L, NA, NA, NA, NA, NA, 3L, 10L, NA,
NA, NA, 18L, NA, NA, NA, NA, NA, NA, NA, NA, 17L, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 8L, NA, NA, NA, 15L,
NA, NA, NA, 1L, 12L, NA, NA), .Label = c("A5", "A6", "B28",
"B30", "B78", "C103", "C123", "C23 C25 C27", "C52", "C83",
"C85", "D10 D12", "D33", "D56", "E31", "E46", "F G73", "F33",
"G6"), class = "factor"), Embarked = structure(c(3L, 1L,
3L, 3L, 3L, 2L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 2L,
3L, 3L, 1L, 3L, 3L, 2L, 3L, 3L, 3L, 1L, 3L, 2L, 3L, 1L, 1L,
2L, 3L, 1L, 3L, 1L, 3L, 3L, 1L, 3L, 3L, 1L, 1L, 2L, 3L, 2L,
2L, 1L, 3L, 3L, 3L, 1L, 3L, 1L, 3L, 3L, 1L, 3L, 3L, 1L, NA,
3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 1L, 1L, 3L, 3L), .Label = c("C", "Q", "S"
), class = "factor"), Surname = structure(c(11L, 21L, 38L,
30L, 2L, 58L, 53L, 69L, 45L, 61L, 75L, 10L, 76L, 3L, 90L,
39L, 71L, 94L, 89L, 52L, 31L, 8L, 55L, 82L, 69L, 6L, 26L,
29L, 67L, 86L, 88L, 83L, 32L, 93L, 56L, 40L, 51L, 13L, 89L,
62L, 1L, 87L, 48L, 49L, 23L, 72L, 50L, 66L, 74L, 5L, 70L,
63L, 36L, 27L, 68L, 95L, 73L, 64L, 92L, 34L, 79L, 42L, 37L,
80L, 85L, 59L, 65L, 20L, 3L, 47L, 44L, 34L, 41L, 18L, 9L,
57L, 84L, 60L, 12L, 25L, 91L, 77L, 54L, 14L, 43L, 7L, 28L,
81L, 29L, 15L, 17L, 4L, 16L, 22L, 19L, 78L, 33L, 35L, 24L,
46L), .Label = c("Ahlin", "Allen", "Andersson", "Andreasson",
"Arnold-Franchi", "Asplund", "Backstrom", "Beesley", "Bing",
"Bonnell", "Braund", "Caldwell", "Cann", "Carrau", "Celotti",
"Chaffee", "Christmann", "Chronopoulos", "Coxon", "Crease",
"Cumings", "Dean", "Devaney", "Doling", "Dowdell", "Emir",
"Faunthorpe", "Ford", "Fortune", "Futrelle", "Fynney", "Glynn",
"Goldschmidt", "Goodwin", "Greenfield", "Harper", "Harris",
"Heikkinen", "Hewlett", "Holverson", "Hood", "Icard", "Ilett",
"Jenkin", "Johnson", "Kantor", "Kink", "Kraeff", "Laroche",
"Lennon", "Mamee", "Masselmani", "McCarthy", "McDermott",
"McGowan", "Meyer", "Moen", "Moran", "Moubarek", "Moutal",
"Nasser", "Nicola-Yarred", "Nosworthy", "Novel", "Nye", "O'Driscoll",
"O'Dwyer", "Ostby", "Palsson", "Panula", "Rice", "Rogers",
"Rugg", "Samaan", "Sandstrom", "Saundercock", "Sheerlinck",
"Shorney", "Sirayanian", "Skoog", "Slocovski", "Sloper",
"Spencer", "Staneff", "Stewart", "Todoroff", "Turpin", "Uruchurtu",
"Vander Planke", "Vestrom", "Waelens", "West", "Wheadon",
"Williams", "Woolner"), class = "factor"), Age_Range = c(4,
4, 4, 4, 4, NA, 4, 2, 4, 3, 2, 4, 4, 4, 3, 4, 2, NA, 4, NA,
4, 4, 3, 4, 2, 4, NA, 3, NA, NA, 6, NA, NA, 8, 5, 6, NA,
5, 3, 3, 6, 5, NA, 2, 3, NA, NA, NA, NA, 3, 2, 5, 6, 5, 8,
NA, 5, 5, 2, 2, 5, 5, 6, 2, NA, NA, 5, 3, 3, 5, 5, 3, 5,
5, 5, 5, NA, NA, 1, 5, 5, 5, NA, 5, 3, 5, 3, NA, 5, 5, 5,
5, 6, 5, 7, NA, 9, 5, 5, 5)), .Names = c("PassengerId", "Survived",
"Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare",
"Cabin", "Embarked", "Surname", "Age_Range"), row.names = c(NA,
100L), class = "data.frame")

ggplot: comparing multiple continuous variables with one binary variable

I am trying to draw boxplots of multiple continuous variables (about 20 variables) against one binary outcome variable (either 0 or 1).
data:
ID outcome var1 var2 var3 var4 var5
1 0 62 2.01 13 1.94 8
2 0 150 4.32 9 99 6
3 0 18 1.86 0.6 99 22
4 0 60 4.08 3 -99 6
5 1 20 1.96 1 99 14
6 1 100 1.64 19 -99 3
my code:
# Reshape to long format, keeping ID and outcome as identifier columns.
tmp <- melt(data, id.vars=c("ID", "outcome"))
# Box plot of value against outcome, filled by `Label`.
# NOTE(review): `Label` is not a column of the sample data shown above
# (ID, outcome, var1..var5), and outcome is mapped to x as a numeric value.
p <- ggplot(data = tmp, aes(x=outcome, y= value)) +
geom_boxplot(aes(fill=Label))
# One panel per melted variable, with scales="free".
p + facet_wrap( ~ variable, scales="free")
This code produces the following error:
Error in layout_base(data, vars, drop = drop) : At least one layer must contain all variables used for facetting
Any help would be greatly appreciated.
There are a couple of problems here.
1) You don't have a variable called Label.
2) outcome is numeric, so ggplot treats it as a continuous variable rather than as two groups.
After removing Label and converting outcome to a factor, the code works:
# Box plots of the melted values, grouped by the binary outcome.
# outcome is wrapped in as.factor() so it is used as a discrete x variable;
# every melted variable gets its own panel with scales = "free".
ggplot(tmp, aes(x = as.factor(outcome), y = value)) +
  geom_boxplot() +
  facet_wrap(~ variable, scales = "free")
data:
# Long-format example data: 6 IDs x 5 measured variables (30 rows).
# Rows are grouped by variable; within each group they run over IDs 1-6.
tmp <- data.frame(
  ID = rep(1:6, times = 5),
  # outcome is 0 for IDs 1-4 and 1 for IDs 5-6, repeated once per variable.
  outcome = rep(c(0L, 0L, 0L, 0L, 1L, 1L), times = 5),
  variable = factor(
    rep(paste0("var", 1:5), each = 6),
    levels = paste0("var", 1:5)
  ),
  value = c(
    62, 150, 18, 60, 20, 100,            # var1
    2.01, 4.32, 1.86, 4.08, 1.96, 1.64,  # var2
    13, 9, 0.6, 3, 1, 19,                # var3
    1.94, 99, 99, -99, 99, -99,          # var4
    8, 6, 22, 6, 14, 3                   # var5
  )
)

Resources