Ggplot: comparing multiple continuous variable with one binary variable - r

I am tring to plot(boxplots) multiple continuous variables (about 20 variables) with one binary outcome variable (either 0 or 1).
data:
ID outcome var1 var2 var3 var4 var5
1 0 62 2.01 13 1.94 8
2 0 150 4.32 9 99 6
3 0 18 1.86 0.6 99 22
4 0 60 4.08 3 -99 6
5 1 20 1.96 1 99 14
6 1 100 1.64 19 -99 3
my code:
tmp <- melt(data, id.vars=c("ID", "outcome"))
p <- ggplot(data = tmp, aes(x=outcome, y= value)) +
geom_boxplot(aes(fill=Label))
p + facet_wrap( ~ variable, scales="free")
this code shows the following error:
Error in layout_base(data, vars, drop = drop) : At least one layer must contain all variables used for facetting
Any help would be greatly appreciated.

There are a couple of problems here.
1) You don't have a variable called Label.
2) outcome is a continuous variable.
Removing Label and making outcome into a factor, the code works
ggplot(data = tmp, aes(x=as.factor(outcome), y= value)) +
geom_boxplot() +
facet_wrap( ~ variable, scales="free")
data:
tmp <- structure(list(ID = c(1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L,
5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L,
3L, 4L, 5L, 6L), outcome = c(0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L,
0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L,
0L, 0L, 0L, 0L, 1L, 1L), variable = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L,
4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("var1", "var2",
"var3", "var4", "var5"), class = "factor"), value = c(62, 150,
18, 60, 20, 100, 2.01, 4.32, 1.86, 4.08, 1.96, 1.64, 13, 9, 0.6,
3, 1, 19, 1.94, 99, 99, -99, 99, -99, 8, 6, 22, 6, 14, 3)), row.names = c(NA,
-30L), .Names = c("ID", "outcome", "variable", "value"), class = "data.frame")

Related

How to merge plots from metafor and ggplot?

Please, find My Data q and p below
I want to merge a Forest Plot computed with library(metafor) with two ggplots
I have tried multiple approaches with gridArrange and par(), but without any success. I know there are several threads on this topic, but none of them addresses the combination of ggplot and metafor
The merged plot should be arranged like this:
The PLOT 1 and PLOT 2 is based on the following data w
w <- structure(list(WHO = c(1L, 3L, 2L, 2L, 2L, 3L, 2L, 3L, 1L, 2L,
3L, 3L, 3L, 1L, 2L, 1L, 2L, 3L, 3L, 1L, 2L, 1L, 3L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 4L, 4L, 1L, 4L, 1L, 2L, 1L, 4L, 1L, 4L, 4L, 4L,
4L, 3L, 3L, 4L, 4L, 4L, 4L, 1L, 4L, 4L, 2L, 1L, 2L, 2L, 4L, 4L,
4L, 2L, 4L, 1L, 4L, 4L, 2L, 4L, 4L, 3L, 4L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 2L, 2L, 3L, 3L, 3L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L,
3L, 4L, 3L, 4L, 3L), response = c(0L, 1L, 0L, 0L, 0L, 1L, 1L,
1L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 1L, NA, 1L), Death = c(0L, 1L, 1L, 0L, 0L,
1L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L,
1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, NA, 1L)), class = "data.frame", row.names = c(NA,
-111L))
PLOT 1
Is based on this script.
yaks <- c(0,2,4,6,8,10,12,14,16,18)
j <- ggplot() +
geom_bar(mapping = aes(x = w$WHO[w$response==1]), stat = "count", alpha=0.2, colour="#1C73C2", fill="#ECF0F9") +
scale_x_discrete(name="", drop = FALSE, labels =c("WHO-I\nn=32","WHO-II\nn=23","WHO-III\nn=16", "Unknown\nn=5")) +
theme(axis.text.x = element_text(color = "grey20", size = 11))
j <-
j %+% scale_y_continuous(name = "Progression", breaks=yaks, sec.axis = sec_axis(~ . * 1 , name = "Progression rate per 100 person-yrs", breaks=yaks, labels=c("0","40","80","120","160","200","240","280","320","360"))) +
coord_cartesian(ylim=c(0, 18)) +
theme(axis.text.y.right = element_text(color = "red", size = 11), axis.title.y=element_text(size=14,face="bold", margin = margin(t = 0, r = 20, b = 0, l = 0))) +
theme(axis.text.y = element_text(color = "#1C73C2", size = 11), axis.title.y.right=element_text(size=14,face="bold", margin = margin(t = 0, r = 0, b = 0, l = 20)))
j +
geom_point(mapping = aes(x = 1, y = 0.2677376), size=10, alpha=0.7, shape=18, colour="red") +
geom_point(mapping = aes(x = 2, y = 3.2870709), size=10, shape=18,alpha=0.7, colour="red") +
geom_point(mapping = aes(x = 3, y = 16.98), size=10, shape=18,alpha=0.7, colour="red") +
annotate("text", x = c(1,2,3, 4) , y = c(-0.5, 2.2870709, 15.98, 0.5), label = c("5.4","65.7","339.6", "0 events"), col="red", fontface=2, cex=4)
And looks like this:
Whereas PLOT 2 is based on this script
j <- ggplot() +
geom_bar(mapping = aes(x = w$WHO[w$Death==1]), stat = "count", alpha=0.2, colour="#1C73C2", fill="#ECF0F9")
yaks <- c(0,2,4,6,8,10,12,14,16,18)
j <-
j %+% scale_y_continuous(name = "Deaths", breaks=yaks, sec.axis = sec_axis(~ . * 1 , name = "Mortality rate per 100 person-yrs", breaks=yaks, labels=c("0","5","10","15","20","25","30","35","40","45"))) +
coord_cartesian(ylim=c(0, 18)) +
theme(axis.text.y.right = element_text(color = "red", size = 11), axis.title.y=element_text(size=14,face="bold",margin = margin(t = 0, r = 20, b = 0, l = 0))) +
theme(axis.text.y = element_text(color = "#1C73C2", size = 11), axis.title.y.right=element_text(size=14,face="bold",margin = margin(t = 0, r = 0, b = 0, l = 20)))
j <-
j +
geom_point(mapping = aes(x = 1, y = 3.329993), size=10,alpha=0.7, shape=18, colour="red") +
geom_point(mapping = aes(x = 2, y = 12.424504), size=10,alpha=0.7, shape=18, colour="red") +
geom_point(mapping = aes(x = 3, y = 17.23519), size=10, alpha=0.7,shape=18, colour="red") +
geom_point(mapping = aes(x = 4, y = 4.549763), size=10, alpha=0.7, shape=18, colour="red") +
annotate("text", x = c(1,2,3,4) , y = c(2.329993, 11.424504, 16.23519,3.549763 ), label = c("8.3","31.1","43.1","11.4"), col="red", fontface=2, cex=4)
j + scale_x_continuous(name="", breaks = c(1,2,3,4), labels =c("WHO-I\nn=37","WHO-II\nn=29","WHO-III\nn=19","Unknown\nn=25")) +
theme(axis.text.x = element_text(color = "grey20", size = 11))
And looks like this:
Finally, PLOT 3 is a Forest Plot from metafor
The Forest Plot looks like this:
With the following data
q <- structure(list(study = structure(c(2L, 4L, 7L, 3L, 5L, 1L, 8L,
6L, 9L), .Label = c("WHO-I versus Unknown ", "WHO-I versus WHO-II",
"WHO-I versus WHO-II ", "WHO-I versus WHO-III", "WHO-I versus WHO-III ",
"WHO-II versus Unknown", "WHO-II versus WHO-III", "WHO-II versus WHO-III ",
"WHO-III versus Unknown"), class = "factor"), order = 9:1, x1i = c(4L,
4L, 15L, 9L, 9L, 9L, 15L, 15L, 12L), n1i = c(32L, 32L, 23L, 37L,
37L, 37L, 29L, 29L, 19L), t1i = c(74.7, 74.7, 22.8, 108.1, 108.1,
108.1, 48.3, 48.3, 27.9), x2i = c(15L, 15L, 15L, 15L, 12L, 9L,
12L, 9L, 9L), n2i = c(23L, 16L, 16L, 29L, 19L, 25L, 19L, 25L,
25L), t2i = c(22.8, 4.4, 4.4, 48.3, 27.9, 79.1, 27.9, 79.1, 79.1
), ir1 = c(5.4, 5.4, 65.7, 8.3, 8.3, 8.3, 31.1, 31.1, 43.1),
ir2 = c(65.7, 339.6, 339.6, 31.1, 43.1, 11.4, 43.1, 11.4,
11.4)), class = "data.frame", row.names = c(NA, -9L))
And script
q <- escalc(measure="IRR", x1i=x1i, t1i=t1i, x2i=x2i, t2i=t2i, data=q)
q1 <- rma(yi, vi, data=q, slab=paste(study, sep=", "), method = "REML")
## Forest
forest(q1, xlim=c(-27,8), atransf=exp, showweights = FALSE, psize = 1.2, refline=log(1),
cex=0.95, ylim=c(0.1, 17), font=1, col="white", border="white", order=order(q$order),
ilab=cbind(q$x1i, q$t1i, q$ir1, q$x2i, q$t2i,q$ir2),
ilab.xpos=c(-19.3,-17,-15,-12.3,-10,-8),
rows=c(2:7,11:13),xlab="Rate ratios", mlab="")
# Headlines
text(c(-19,-16.8,-15,-12,-9.8,-8) ,15.7,font=1, cex=0.9, c("Events\n per total", "Person-\nyrs", "IR", "Events\n per total", "Person-\nyrs","IR"))
text(c(-18.75,-18.75,-18.65) ,c(13,12,11),font=1, cex=0.94, c("/ 32", "/ 32", " / 23"))
text(c(-18.75,-18.75,-18.75) ,c(7,6,5),font=1, cex=0.94, c("/ 37", "/ 37", "/ 37"))
text(c(-18.65,-18.65,-18.65) ,c(4,3,2),font=1, cex=0.94, c(" / 29", " / 29", " / 19"))
text(c(-11.65,-11.65,-11.65) ,c(13,12,11),font=1, cex=0.94, c(" /23", " /16", " /16"))
text(c(-11.65,-11.65,-11.75) ,c(7,6,5),font=1, cex=0.94, c(" /29", "/19", " /25"))
text(c(-11.65,-11.75,-11.75) ,c(4,3,2),font=1, cex=0.94, c("/19", " / 25", " / 25"))
text(8 ,15.7,font=1, "Rate ratio [95% CI]", pos=2, cex=1)
text(-27 ,c(14,8),font=2, c("Progression rates","Mortality rates"), pos=4, cex=0.9)
text(-27 ,c(1,10),font=1, c("Cohort: 110 patients included","Cohort: 76 patients included"), pos=4, cex=0.8)

Loop over specific columns data and add the result as a new column in R

I have a dataframe df with following information:
df <- structure(list(Samples = structure(c(1L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 2L, 1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 2L, 1L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 2L, 1L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 2L), .Label = c("Sample1", "Sample10", "Sample2",
"Sample3", "Sample4", "Sample5", "Sample6", "Sample7", "Sample8",
"Sample9"), class = "factor"), patient.vital_status = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 1L), years = c(3.909589041, 1.457534247,
2.336986301, 5.010958904, 1.665753425, 1.81369863, 1.191780822,
4.687671233, 2.167123288, 1.95890411, 3.909589041, 1.457534247,
2.336986301, 5.010958904, 1.665753425, 1.81369863, 1.191780822,
4.687671233, 2.167123288, 1.95890411, 3.909589041, 1.457534247,
2.336986301, 5.010958904, 1.665753425, 1.81369863, 1.191780822,
4.687671233, 2.167123288, 1.95890411, 3.909589041, 1.457534247,
2.336986301, 5.010958904, 1.665753425, 1.81369863, 1.191780822,
4.687671233, 2.167123288, 1.95890411), Genes = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("A1BG", "A1CF", "A2M",
"A2ML1"), class = "factor"), value = c(0.034459012, 0.017698878,
0.023313851, 0.010456762, 0.032674019, 0.037561831, 0.03380681,
0, 0.019954956, 0.012392427, 0.835801613, 2.265192447, 2.431409095,
5.012117956, 2.139962802, 2.371946704, 4.555234385, 0.550293401,
0.924012327, 2.274642129, 92.85639578, 79.50897642, 23.72187602,
26.86025304, 32.80504253, 222.6449054, 71.78812505, 45.76371588,
29.93976676, 22.97515484, 0.03780441, 0.005825143, 0, 0.002867985,
0.011948708, 0.02060423, 0.004636111, 0.015903347, 0.005473063,
0.033988816)), class = "data.frame", row.names = c(NA, -40L))
I want to loop over the information based on the columns Genes and value and get a result. And again I want the result to be added to the dataframe df. The result will be with low or high.
I'm trying to do this with the following code, but it doesn't work:
genes <- as.character(unique(df$Genes))
library(survival)
library(survminer)
for(i in genes){
surv_rnaseq.cut <- surv_cutpoint(
df,
time = "years",
event = "patient.vital_status",
variables = c("Genes","value"))
df$cat <- surv_categorize(surv_rnaseq.cut)
}
Along with the above result I also wanted the summary for surv_rnaseq.cut for all the four genes with mentioning its name.
Any help please. thanq
An option would be to split by 'genes' (group_split), loop over the list, apply the functions and bind the list elements after creating the column
library(survminer)
library(survival)
library(dplyr)
library(purrr)
df %>%
group_split(Genes) %>%
map_dfr(~ surv_cutpoint(.x,
time = "years",
event = "patient.vital_status",
variables = c("Genes", "value")) %>%
surv_categorize %>%
pull(value) %>%
mutate(.x, cat = .))
# A tibble: 40 x 6
# Samples patient.vital_status years Genes value cat
# <fct> <int> <dbl> <fct> <dbl> <chr>
# 1 Sample1 0 3.91 A1BG 0.0345 high
# 2 Sample2 0 1.46 A1BG 0.0177 high
# 3 Sample3 0 2.34 A1BG 0.0233 high
# 4 Sample4 0 5.01 A1BG 0.0105 high
# 5 Sample5 0 1.67 A1BG 0.0327 high
# 6 Sample6 0 1.81 A1BG 0.0376 high
# 7 Sample7 0 1.19 A1BG 0.0338 high
# 8 Sample8 1 4.69 A1BG 0 low
# 9 Sample9 0 2.17 A1BG 0.0200 high
#10 Sample10 1 1.96 A1BG 0.0124 high
# … with 30 more rows

HSD.test row names error. How do I check row names?

I have a dataframe for which I did a two-way ANOVA.
dput(m3)
structure(list(Delta = c(-40, -40, -40, -40, -31.7, -29.3, -27.8,
-26.7, -26.2, -25.4, -24.7, -23.1, -23, -22.9, -22.4, -22.2,
-21.4, -21, -20.8, -15.1, -14.9, -14.1, -6.2, -6.2, -6, -5.3,
-4.9), Location = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 3L, 2L,
3L, 3L, 3L), .Label = c("int", "pen + int", "ter + pen"), class = "factor"),
Between = c(0L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 0L, 2L, 1L, 0L,
1L, 0L, 2L, 0L, 2L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L
), Relative = structure(c(5L, 6L, 6L, 7L, 8L, 3L, 3L, 4L,
5L, 4L, 3L, 5L, 3L, 5L, 7L, 5L, 4L, 6L, 3L, 3L, 6L, 2L, 1L,
2L, 1L, 1L, 1L), .Label = c("1&2", "2&3", "2&4", "2&5", "3&4",
"3&5", "3&6", "4&6"), class = "factor")), class = "data.frame", row.names = c(NA,
-27L))
library(agricolae)
aov.2sum=aov(Delta.~Location*X.between, data=m3)
I want to analyze the data using a HSD.test as I have for another dataframe using the same features.
I am following the code format in the package manual as below.
tx <- with(m3, interaction(Location, X.between))
amod <-aov(Delta~tx, data=m3)
test=HSD.test(amod, "tx", group=TRUE)
Then I receive the following error
Error in .rowNamesDF<-(x, value = value) :
duplicate 'row.names' are not allowed
In addition: Warning message:
non-unique values when setting 'row.names': ‘int.0’, ‘pen + int.1’, ‘pen + int.2’, ‘te + int.0’, ‘te + int.1’
Upon further analysis I see that my duplicate row names error is related to my X.between feature. When I use the following code I get the same duplicate row names error:
HSD.test(amod, "X.between", group=TRUE)
>> Error in data.frame(row.names = means[, 1], means[, 2:6]) :
duplicate row.names: 0, 1, 2
How are row names chosen for the HSD.test?
Then how can I change my row names? Or just avoid this duplication error?
Thank you for all and any help.

R, calculating last 3 average

I have the following data frame in R. It contains the statistics of each player in the Olympics basketball tournament
Sample Table
Each game is denoted by a number in the game column. I would like to create a new column with the average of the last 3 games. When following examples in similar posts, my biggest down fall is having games numbers instead of actual dates that seem to be required for other methods.
Any assistance would be greatly appreciated.
Thanks
EDIT:
To clarify a little more based on some of the solutions and suggestions. For each row I would like to have the new column show the Average minutes or points from the last 3 games. So far the suggestion make each row show the average of games 3, 4, & 5.
So for example.
Player A, game = 3
Avg Pts = mean(pts game1, pts game2, pts game3)
Player B, game = 4
Avg pts = mean(pts game2 ,pts game3, pts game4)
I hope that clears it up.
Thanks
Data:
I am very new at this. I hope this is the appropriate method for sharing data.
structure(list(Player = structure(c(1L, 2L, 6L, 8L, 17L, 21L,
23L, 24L, 24L, 24L, 24L, 25L, 26L, 15L, 20L, 20L, 12L, 15L, 11L,
5L, 15L, 16L, 14L, 9L, 20L, 11L, 18L, 4L, 12L, 9L, 4L, 9L, 20L,
12L, 5L, 13L, 22L, 7L, 11L, 20L, 4L, 5L, 10L, 11L, 14L, 19L,
3L, 7L, 14L, 5L), .Label = c("Adas Juskevicius", "Alex Abrines",
"Andrew Bogut", "Bojan Bogdanovic", "Boris Diaw", "Brock Motum",
"Dario Saric", "Dwight Lewis", "Facundo Campazzo", "Ike Diogu",
"Jianlian Yi", "Jonas Maciulis", "Kevin Durant", "Luis Scola",
"Mantas Kalnietis", "Matt Dellavedova", "Miguel Marriaga", "Milos Teodosic",
"Nikola Mirotic", "Pau Gasol", "Rafa Luz", "Ricky Rubio", "Roberto Acuna",
"Vaidas Kariniauskas", "Windi Graterol", "Zeljko Sakic"), class = "factor"),
Team = structure(c(8L, 6L, 2L, 12L, 12L, 3L, 1L, 8L, 8L,
8L, 8L, 12L, 5L, 8L, 6L, 6L, 8L, 8L, 4L, 7L, 8L, 2L, 1L,
1L, 6L, 4L, 10L, 5L, 8L, 1L, 5L, 1L, 6L, 8L, 7L, 11L, 6L,
5L, 4L, 6L, 5L, 7L, 9L, 4L, 1L, 6L, 2L, 5L, 1L, 7L), .Label = c("ARG",
"AUS", "BRZ", "CHN", "CRO", "ESP", "FRA", "LTU", "NGR", "SRB",
"USA", "VEN"), class = "factor"), Pos = structure(c(3L, 4L,
2L, 5L, 2L, 5L, 1L, 2L, 2L, 2L, 2L, 1L, 4L, 3L, 1L, 1L, 4L,
5L, 2L, 2L, 5L, 3L, 2L, 3L, 1L, 4L, 5L, 2L, 2L, 3L, 2L, 3L,
1L, 2L, 2L, 4L, 3L, 4L, 4L, 1L, 2L, 2L, 2L, 4L, 1L, 2L, 1L,
4L, 1L, 2L), .Label = c("C", "PF", "PG", "SF", "SG"), class = "factor"),
game = c(4L, 5L, 4L, 5L, 3L, 4L, 3L, 1L, 2L, 3L, 4L, 5L,
5L, 3L, 2L, 3L, 3L, 4L, 3L, 3L, 2L, 4L, 3L, 3L, 5L, 5L, 5L,
4L, 2L, 2L, 2L, 5L, 4L, 4L, 2L, 2L, 1L, 4L, 4L, 1L, 5L, 4L,
3L, 2L, 4L, 2L, 2L, 3L, 2L, 1L), Status = c(0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), Drafted = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 85,
82, 80, 78, 77, 74, 68, 68, 68, 65, 64, 63, 62, 62, 61, 61,
60, 59, 59, 59, 58, 57, 57, 57, 56, 56, 56, 55, 55, 55, 55,
54, 54, 53, 53, 52, 51), Min = c(11.04, 1.44, 16.56, 2.88,
4.8, 1.92, 13.68, 3.84, 9.36, 2.64, 21.12, 17.04, 0.24, 36.48,
32.16, 23.28, 26.88, 17.28, 33.6, 28.56, 30.48, 19.92, 30.24,
25.92, 27.84, 34.8, 15.12, 36, 28.8, 29.04, 29.28, 21.36,
23.04, 18.72, 21.12, 25.2, 12.24, 27.12, 32.88, 31.92, 34.08,
18.24, 27.6, 32.64, 33.6, 32.88, 24.72, 34.8, 35.76, 31.44
), FIC = c(3.8, 1.5, 10.2, 1, 0, -1, 0.2, 0.5, -3.2, -1,
0.6, 4.5, -0.5, 15.6, 9.5, 11.1, 0.5, 7.8, 17, 16.8, 25.2,
10.5, 10, 6, 14.4, 6, 7.5, 15.5, 14.8, 6.2, 7.9, 3, 26.9,
0.8, 11.4, 16, -1, 4.9, 14.1, 18.5, 5.9, 6.5, 10, 10, 10,
8, 19, 9, 12.1, 7.5), FP = c(8, 4, 21.75, 2, 2.75, -0.5,
4.75, 1.5, 2.5, 1.25, 8.5, 13, 0, 35.25, 37, 32.25, 17, 18.5,
39.5, 34.25, 49, 19.25, 28.75, 20.25, 41.25, 27.5, 16.5,
39.25, 33.5, 29, 30.75, 13.25, 47.25, 9, 24.5, 28.5, 6.25,
19.5, 38.25, 40.25, 27.5, 17, 21.75, 37.5, 29, 21, 38.5,
30.75, 37.75, 25.75), FPM = c(0.72463768115942, 2.77777777777778,
1.31340579710145, 0.694444444444444, 0.572916666666667, -0.260416666666667,
0.347222222222222, 0.390625, 0.267094017094017, 0.473484848484848,
0.402462121212121, 0.762910798122066, 0, 0.966282894736842,
1.15049751243781, 1.38530927835052, 0.632440476190476, 1.07060185185185,
1.17559523809524, 1.19922969187675, 1.60761154855643, 0.96636546184739,
0.950727513227513, 0.78125, 1.48168103448276, 0.790229885057471,
1.09126984126984, 1.09027777777778, 1.16319444444444, 0.99862258953168,
1.05020491803279, 0.620318352059925, 2.05078125, 0.480769230769231,
1.16003787878788, 1.13095238095238, 0.51062091503268, 0.719026548672566,
1.16332116788321, 1.2609649122807, 0.806924882629108, 0.932017543859649,
0.78804347826087, 1.14889705882353, 0.863095238095238, 0.638686131386861,
1.55744336569579, 0.883620689655172, 1.05564876957494, 0.819020356234097
), PTS = c(5L, 2L, 15L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 6L, 9L,
0L, 17L, 13L, 16L, 10L, 16L, 18L, 11L, 21L, 6L, 12L, 10L,
19L, 20L, 7L, 28L, 21L, 10L, 18L, 10L, 23L, 4L, 7L, 16L,
0L, 7L, 20L, 26L, 22L, 10L, 7L, 19L, 14L, 6L, 9L, 15L, 23L,
9L), TPM = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 2L, 0L, 0L, 2L, 2L, 0L, 0L, 1L, 0L, 1L, 2L, 1L, 0L,
1L, 3L, 3L, 1L, 1L, 1L, 5L, 0L, 1L, 2L, 0L, 1L, 2L, 3L, 4L,
0L, 0L, 3L, 2L, 0L, 1L, 3L, 3L, 1L), Ast = c(2L, 0L, 2L,
0L, 1L, 0L, 0L, 1L, 2L, 0L, 1L, 1L, 0L, 7L, 1L, 3L, 1L, 2L,
1L, 9L, 12L, 8L, 4L, 1L, 1L, 2L, 5L, 2L, 2L, 8L, 2L, 1L,
5L, 2L, 5L, 5L, 1L, 0L, 2L, 1L, 1L, 0L, 3L, 0L, 1L, 2L, 6L,
3L, 0L, 2L), Reb = c(0L, 0L, 3L, 0L, 1L, 0L, 1L, 0L, 0L,
1L, 2L, 2L, 0L, 5L, 10L, 7L, 6L, 0L, 10L, 9L, 4L, 1L, 7L,
3L, 13L, 2L, 0L, 3L, 4L, 4L, 7L, 1L, 5L, 0L, 4L, 2L, 1L,
6L, 9L, 9L, 2L, 4L, 7L, 6L, 10L, 8L, 12L, 7L, 9L, 5L), BLK = c(0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 4L, 2L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 2L, 2L, 0L, 0L, 1L, 0L, 1L,
0L, 2L, 0L, 3L, 1L, 0L, 1L, 2L, 0L, 0L, 0L, 1L, 1L, 0L, 1L,
3L, 1L, 1L, 2L), STL = c(0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 3L, 0L, 1L, 0L, 1L,
2L, 0L, 0L, 1L, 2L, 1L, 2L, 0L, 0L, 2L, 1L, 0L, 0L, 2L, 2L,
0L, 0L, 1L, 1L, 0L, 4L, 0L, 0L, 0L, 1L, 1L, 2L), TO = c(1L,
0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 3L, 1L, 0L, 3L, 3L, 2L,
6L, 3L, 0L, 3L, 2L, 0L, 1L, 0L, 3L, 4L, 1L, 2L, 2L, 5L, 3L,
0L, 0L, 0L, 3L, 1L, 1L, 3L, 2L, 0L, 5L, 0L, 1L, 1L, 3L, 0L,
2L, 6L, 4L, 2L)), .Names = c("Player", "Team", "Pos", "game",
"Status", "Drafted", "Min", "FIC", "FP", "FPM", "PTS", "TPM",
"Ast", "Reb", "BLK", "STL", "TO"), row.names = c(NA, 50L), class = "data.frame")
Using dplyr with mtcars example data:
library(dplyr)
mtcars %>%
group_by(cyl) %>%
mutate(last3mean = mean(tail(mpg, 3)))
In your case, instead of cyl and mpg, use Player and the column to aggregate.
Using data.table, (suggested by #akrun):
data.table as.data.table(mtcar‌​s)[, .(last3mean = mean(tail(mpg,3))), by = cyl]
You can use rollmeanr from the zoo package with dplyr. This has the feature that not only the last three games of a player is averaged, but the last three game moving average is computed for each player. The code is as follows:
library(dplyr)
library(zoo)
avg.last.3 <- function (x) if (length(x) < 3) rep(NA, length(x)) else rollmeanr(x, 3, fill = NA) ## 1.
res <- df %>% group_by(Player) %>% arrange(game) %>% ## 2.
mutate(Avg.Pts=avg.last.3(PTS)) %>% ## 3.
ungroup() %>% arrange(Player,game) ## 4.
Notes:
Define a function avg.last.3 that applies the function rollmeanr with window length of 3. rollmeanr specifies align="right" to average the last three games, and we pad any result that does not have three days to average by NA. Note that the if condition in this function is needed so that:
length of x is at least the window length for rollmeanr as required by rollmeanr
avg.last.3 returns a vector that is the same length as its input as required by mutate.
First group_by the Player. Since I noted that the game column is not necessarily sorted for each Player, we sort by game in ascending order.
Use mutate to create a new column Avg.Pts resulting from applying the avg.last.3 function on a column, for example PTS.
Finally, ungroup and present the result sorted by Player followed by game
Of course, you can get the average of any number of columns by:
mutate(Avg.Pts=avg.last.3(PTS), Avg.Min=avg.last.3(Min), Avg.Ast=avg.last.3(Ast), ...)
The results averaging only the PTS column is given by (printing only the first six columns plus PTS and Avg.Pts):
print(res[,c(colnames(res)[1:6],"PTS","Avg.Pts")],n=50)
### A tibble: 50 x 8
## Player Team Pos game Status Drafted PTS Avg.Pts
## <fctr> <fctr> <fctr> <int> <int> <dbl> <int> <dbl>
##1 Adas Juskevicius LTU PG 4 0 0 5 NA
##2 Alex Abrines ESP SF 5 0 0 2 NA
##3 Andrew Bogut AUS C 2 1 53 9 NA
##4 Bojan Bogdanovic CRO PF 2 1 59 18 NA
##5 Bojan Bogdanovic CRO PF 4 1 61 28 NA
##6 Bojan Bogdanovic CRO PF 5 1 55 22 22.666667
##7 Boris Diaw FRA PF 1 1 51 9 NA
##8 Boris Diaw FRA PF 2 1 57 7 NA
##9 Boris Diaw FRA PF 3 1 68 11 9.000000
##10 Boris Diaw FRA PF 4 1 55 10 9.333333
##11 Brock Motum AUS PF 4 0 0 15 NA
##12 Dario Saric CRO SF 3 1 53 15 NA
##13 Dario Saric CRO SF 4 1 56 7 NA
##14 Dwight Lewis VEN SG 5 0 0 0 NA
##15 Facundo Campazzo ARG PG 2 1 60 10 NA
##16 Facundo Campazzo ARG PG 3 1 64 10 NA
##17 Facundo Campazzo ARG PG 5 0 59 10 10.000000
##18 Ike Diogu NGR PF 3 1 55 7 NA
##19 Jianlian Yi CHN SF 2 1 55 19 NA
##20 Jianlian Yi CHN PF 3 1 74 18 NA
##21 Jianlian Yi CHN SF 4 1 56 20 19.000000
##22 Jianlian Yi CHN SF 5 1 62 20 19.333333
##23 Jonas Maciulis LTU PF 2 1 61 21 NA
##24 Jonas Maciulis LTU SF 3 1 78 10 NA
##25 Jonas Maciulis LTU PF 4 1 58 4 11.666667
##26 Kevin Durant USA SF 2 1 57 16 NA
##27 Luis Scola ARG C 2 1 52 23 NA
##28 Luis Scola ARG PF 3 1 65 12 NA
##29 Luis Scola ARG C 4 1 54 14 16.333333
##30 Mantas Kalnietis LTU SG 2 1 68 21 NA
##31 Mantas Kalnietis LTU PG 3 1 85 17 NA
##32 Mantas Kalnietis LTU SG 4 1 77 16 18.000000
##33 Matt Dellavedova AUS PG 4 1 68 6 NA
##34 Miguel Marriaga VEN PF 3 0 0 0 NA
##35 Milos Teodosic SRB SG 5 0 62 7 NA
##36 Nikola Mirotic ESP PF 2 1 54 6 NA
##37 Pau Gasol ESP C 1 1 56 26 NA
##38 Pau Gasol ESP C 2 1 82 13 NA
##39 Pau Gasol ESP C 3 1 80 16 18.333333
##40 Pau Gasol ESP C 4 1 59 23 17.333333
##41 Pau Gasol ESP C 5 1 63 19 19.333333
##42 Rafa Luz BRZ SG 4 0 0 0 NA
##43 Ricky Rubio ESP PG 1 1 57 0 NA
##44 Roberto Acuna ARG C 3 1 0 2 NA
##45 Vaidas Kariniauskas LTU PF 1 0 0 0 NA
##46 Vaidas Kariniauskas LTU PF 2 0 0 0 NA
##47 Vaidas Kariniauskas LTU PF 3 0 0 0 0.000000
##48 Vaidas Kariniauskas LTU PF 4 0 0 6 2.000000
##49 Windi Graterol VEN C 5 0 0 9 NA
##50 Zeljko Sakic CRO SF 5 0 0 0 NA
First split the data frame up by player
playerDFs <- split(origdata, origdata["Player"])
Then subset the last 3 games
playerLast3 <- lapply(playerDFs, function(x) x[tail(order(x[["game"]]),3), ])
Finally get your means
vapply(playerLast3, colMeans, numeric(ncol(origdata)))

Developing a function to analyse rows of a data.table in R

For a sample dataframe:
df1 <- structure(list(area = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("a",
"b"), class = "factor"), region = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("a1",
"a2", "b1", "b2"), class = "factor"), weight = c(0, 1.2, 3.2,
2, 1.6, 5, 1, 0.5, 0.2, 0, 1.5, 2.3, 1.5, 1.8, 1.6, 2, 1.3, 1.4,
1.5, 1.6, 2, 3, 4, 2.3, 1.3, 2.1, 1.3, 1.6, 1.7, 1.8, 2, 1.3,
1, 0.5), var.1 = c(0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L,
1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L,
0L, 0L, 0L, 1L, 0L, 1L, 0L), var.2 = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L,
1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L)), .Names = c("area",
"region", "weight", "var.1", "var.2"), class = c("data.table",
"data.frame"))
I want to first produce a summary table...
area_summary <- setDT(df1)[,.(.N, freq.1 = sum(var.1==1), result = weighted.mean((var.1==1),
w = weight)*100), by = area]
...and then populate it by running the following code for each area (e.g. a, b). This looks for the highest and lowest 'result' in each region, and then produces a xtabs and calculates the relative difference (RD) before adding these to the summary table. Here I have developed the code for area 'a':
#Include only regions with highest or lowest percentage
a_cntry <- subset(df1, area=="a")
a_cntry.summary <- setDT(a_cntry)[,.(.N, freq.1 = sum(var.1==1), result = weighted.mean((var.1==1),
w = weight)*100), by = region]
#Include only regions with highest or lowest percentage
incl <- a_cntry.summary[c(which.min(result), which.max(result)),region]
region <- as.data.frame.matrix(a_cntry)
a_cntry <- a_cntry[a_cntry$region %in% incl,]
#Produce xtabs table of RD
a_cntry.var.1 <- xtabs(weight ~ var.1 + region, data=a_cntry)
a_cntry.var.1
#Produce xtabs table
RD.var.1 <- prop.test(x=a_cntry.var.1[,2], n=rowSums(a_cntry.var.1), correct = FALSE)
RD <- round(- diff(RD.var.1$estimate), 3)
RDpvalue <- round(RD.var.1$"p.value", 4)
RD
RDpvalue
#Add RD and RDpvalue tosummary table
area_summary$RD[area_summary$area == "a"] <- RD
area_summary$RDpvalue[area_summary$area == "a"] <- RDpvalue
rm(RD, RD.var.1, RDpvalue, a_cntry.var.1, incl, a_cntry,a_cntry.summary,region)
I wish to wrap this code into a function, so I can just specify the 'areas' (in the 'area' column in df1) and then the code completes all the analysis and adds the results to the summary table.
If I wanted to call my function stats, I understand it may start like this:
stats= function (df1, x) {
apply(x)
}
If anyone can start me off developing my function, I should be most grateful.

Resources