Related
I have a data set and want to create a time series of means by education and by race. However, I am struggling to make rename() accept a command rather than just a string as a new variable name.
My code:
#libraries
install.packages(c("tidyverse", "spatstat"))
lapply(c("tidyverse", "spatstat"), require, character.only = TRUE)
# Calculate weighted medians by race and education.
# lapply() iterates over the columns of data[, c("race", "education")], so
# inside the function `vars` holds one column's values rather than its name.
# NOTE(review): weighted.median() presumably comes from spatstat - confirm.
wmedians <- lapply(data[,c("race", "education")],function(vars){
data %>%
# Group on the column values passed in as `vars`, and on year.
group_by((vars), year) %>%
summarize(w_median = weighted.median(wealth, weight))%>%
# NOTE(review): `colnames(vars) = ...` puts a function call where an argument
# name is expected; this is the line rejected with the "unexpected '='"
# parse error quoted below.
rename(colnames(vars) = "(vars)")
})
This gives me the following error:
Error: unexpected '=' in:
" summarize(w_median = weighted.median(wealth, weight))%>%
rename(colnames(vars) ="
Desired Output:
At the moment I get a list with two tibbles where the column name of the first column is "(vars)". Instead I would like the column name to be "race" in the first tibble and "education" in the second one.
I experimented a bit, and my guess is that rename() does not accept a function call (such as colnames()) as the new variable name. Do you have any idea how to get around this?
Sample of my data:
structure(list(year = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
10L, 10L, 10L), .Label = c("1989", "1992", "1995", "1998", "2001",
"2004", "2007", "2010", "2013", "2016"), class = "factor"), weight = c(9084.9691295,
1571.9511258, 191.75635451, 204.62890325, 204.62890325, 20.462890325,
10.412082059, 144.25723032, 66.170395167, 17169.253056, 5240.2917738,
768.75688855, 152.72765752, 166.52285228, 4165.0038712, 12.696579164,
1.3158341152, 3574.4768327, 18.640983311, 687.040202, 6738.6966881,
0.9648849583, 2.9597366608, 20.019564258, 895.51359665, 1.3152534108,
3244.207427, 2.0000789024, 750.95122778, 1.6580375994, 4007.7581965,
4.7569235917, 180.73948443, 237.26008744, 2.8105880617, 2.8105880617,
1.7964957199, 4883.711226, 17.268444467, 2.9783310762, 354.15138196,
162.00933944, 1.6450475811, 1.3755398392, 4174.6347012, 44.17020127,
4987.2079388, 1.3755398392, 18.01293584, 3.3426730968, 1.5455142055,
904.20169275, 12.578831203, 10051.580218, 162.70814346, 2.1257090517,
2.1257090517, 1027.7013368, 8166.4587927, 1.7239086827, 2.1374243666,
0.901741906, 2.9900010571, 33.443685091, 12913.631224, 5973.1098594,
9527.6211412, 2.2331957715, 376.47055359, 2.9488054663, 2.9488054663,
2.8570458091, 8.0625166988, 4.7867036342, 923.0539464, 2.7381019933,
5800.0572063, 4.3379657179, 0.8195417131, 6108.0937784, 23.232719795,
121.97520298, 1684.0365357, 4.7867036342, 4.8109344834, 479.31438165,
4.8109344834, 3.7209489469, 5257.7592767, 3220.9438379, 3.1738803883,
3.8910375552, 3.7209489469, 30.231850875, 3277.3748665, 32.237421329,
2.7966498146, 3208.415157, 34.375315295, 30.200771547, 31.811971048,
20.701306688, 2.7966498146, 31.968899323, 33.280487562, 2548.6399138,
32.94638396, 2.7966498146, 2.6033164134, 30.815250688, 23.702590485,
31.465956118, 29.46116036, 760.36545895, 8.3114397117, 23.702590485,
4.6504256805, 3204.2440292, 6.8002099257, 35.813850525), race = structure(c(1L,
1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L,
1L, 4L, 1L, 1L, 4L, 4L, 4L, 1L, 4L, 1L, 4L, 1L, 4L, 1L, 4L, 1L,
1L, 4L, 4L, 4L, 1L, 4L, 4L, 1L, 1L, 4L, 1L, 1L, 4L, 1L, 4L, 4L,
4L, 4L, 1L, 4L, 1L, 1L, 4L, 1L, 1L, 1L, 4L, 4L, 4L, 1L, 1L, 1L,
1L, 1L, 4L, 1L, 4L, 4L, 4L, 4L, 4L, 1L, 4L, 1L, 4L, 4L, 1L, 1L,
1L, 1L, 4L, 4L, 1L, 4L, 4L, 1L, 1L, 1L, 1L, 4L, 4L, 1L, 4L, 4L,
1L, 1L, 4L, 1L, 4L, 4L, 4L, 1L, 1L, 1L, 4L, 4L, 1L, 1L, 1L, 4L,
1L, 4L, 4L, 4L, 1L, 4L, 1L), .Label = c("black", "Hispanic",
"other", "white"), class = "factor"), education = structure(c(2L,
1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L,
2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("college degree", "no college",
"some college"), class = "factor"), wealth = c(370932.98, 10170000.94,
12598660.39, 114293258.81, 211275067.86, 290691670.17, 85726008.16,
230065771.49, 246480115.73, 349587.55, 378694.96, 3164512.87,
8495442.34, 6020105.92, 505133.05, 367073411.82, 2928346179.67,
533643.21, 554996993.66, 5648836.48, 392098.63, 692454429.71,
872798466.66, 352917443.83, 798505.65, 1250534235.01, 645694.94,
570963643.71, 10983328.56, 360732249.01, 717840.77, 365513260.49,
7204246.71, 8901952.1, 473348324.51, 686068914.54, 566191645.55,
451622.81, 376484717.9, 702785331.9, 6677625.87, 5826581.94,
538339875.62, 81126854.16, 1073395.38, 595512233.49, 769008.48,
363748981.28, 203466108.48, 801021687.19, 744505545.07, 5219227.59,
132716087.05, 813859.01, 4429075.76, 376023173.93, 418753292.21,
591392.47, 986060.58, 740193054.24, 305729499.91, 815008777.34,
292998224.45, 25586473.36, 1482007.57, 832890.03, 1431390.64,
465587944.43, 31578347.08, 771207206.27, 774386788.46, 283388639.66,
491131539.34, 1106105605.16, 15792325.59, 448260665.36, 1598512.35,
256789830.23, 1327614423.44, 1034918.02, 44314918.71, 47310703.56,
1174945.19, 1050196871.69, 791040687.75, 40226229.67, 1193945180.43,
882118783.02, 865693.49, 598203.92, 94409003.49, 321989895.39,
611845894.05, 414941965.04, 1697121.78, 192234467.65, 1200517207.56,
1522890.02, 42378401.16, 269560657.85, 49377539.54, 450366559.75,
1362371354.42, 254034284.01, 65726116.22, 840350.29, 87111742.24,
1116803883.14, 1703800000, 248761000, 104962000, 111565400, 1224715000,
1405000, 1196478000, 981045000, 535052000, 2077700, 444554200,
41907500)), row.names = c(NA, -120L), groups = structure(list(
year = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 7L,
7L, 7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 10L, 10L, 10L,
10L), .Label = c("1989", "1992", "1995", "1998", "2001",
"2004", "2007", "2010", "2013", "2016"), class = "factor"),
education = structure(c(1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L,
1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L), .Label = c("college degree", "no college", "some college"
), class = "factor"), race = structure(c(1L, 4L, 1L, 4L,
1L, 4L, 1L, 4L, 1L, 4L, 1L, 4L, 1L, 4L, 1L, 4L, 1L, 4L, 1L,
4L, 1L, 4L, 1L, 4L, 1L, 4L, 1L, 4L, 1L, 4L, 1L, 4L, 1L, 4L,
1L, 4L, 1L, 4L, 1L, 4L), .Label = c("black", "Hispanic",
"other", "white"), class = "factor"), .rows = structure(list(
c(2L, 3L, 12L), c(5L, 6L, 9L), c(1L, 10L, 11L), c(4L,
7L, 8L), c(13L, 14L, 20L), c(17L, 22L, 23L), c(15L, 18L,
21L), c(16L, 19L, 24L), c(29L, 33L, 34L), c(26L, 35L,
36L), c(25L, 27L, 31L), c(28L, 30L, 32L), c(41L, 42L,
44L), c(37L, 40L, 43L), c(38L, 45L, 47L), c(39L, 46L,
48L), c(52L, 55L, 57L), c(50L, 51L, 60L), c(54L, 58L,
59L), c(49L, 53L, 56L), c(63L, 64L, 69L), c(62L, 70L,
71L), 65:67, c(61L, 68L, 72L), c(75L, 81L, 82L), c(74L,
79L, 84L), c(77L, 80L, 83L), c(73L, 76L, 78L), c(86L,
91L, 92L), c(85L, 87L, 88L), c(89L, 90L, 95L), c(93L,
94L, 96L), c(101L, 105L, 107L), c(97L, 103L, 108L), c(98L,
99L, 106L), c(100L, 102L, 104L), 110:112, c(109L, 113L,
115L), c(114L, 118L, 120L), c(116L, 117L, 119L)), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -40L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Thanks a lot in advance!
Pass the column names (as strings) to lapply() and refer to them inside group_by() via the .data pronoun. The column names are then preserved, so you do not need rename() at all.
library(dplyr)
library(spatstat)
# Weighted median of wealth per (grouping variable, year), computed once for
# each grouping column named in the character vector.
#
# setNames(nm = ...) names the vector by its own values, so the resulting list
# is named "race" / "education" and its elements can be accessed by name.
# Inside the function `vars` is a single column name (a string); .data[[vars]]
# looks that column up in the data, so the grouping column keeps its own name.
result <- lapply(setNames(nm = c("race", "education")), function(vars) {
  data %>%
    group_by(.data[[vars]], year) %>%
    # .groups = "drop" returns an ungrouped tibble and silences the
    # regrouping message summarize() would otherwise print.
    summarize(w_median = weighted.median(wealth, weight), .groups = "drop")
})
result
I have several categorical variables and I need to draw horizontal bar plots of the frequencies of their modalities. For example, suppose I want a horizontal bar plot of the variable INTERET_ENVIRONNEMENT, whose modalities are:
> unique(DATABASE$INTERET_ENVIRONNEMENT)
[1] Beaucoup Un peu Pas du tout
Levels: Beaucoup Pas du tout Un peu
Then, using the code below:
# Order in which the answer categories are placed along the discrete axis.
ords <- c("Beaucoup", "Un peu", "Pas du tout")

# Horizontal bar chart of the number of observations per category of
# INTERET_ENVIRONNEMENT; coord_flip() turns the vertical bars sideways.
ggplot(data = DATABASE, mapping = aes(x = INTERET_ENVIRONNEMENT)) +
  geom_bar(width = 0.7, fill = "orange") +
  scale_x_discrete(limits = ords) +
  coord_flip() +
  labs(x = "Storm Type", y = "Number of Observations")
I get this
Now I want to add all other categorical variables to get their horizontal bar plots in the same plot.
For example, if I want to add also the INTERET_COMPOSITION variable which has the same modalities ("Beaucoup", "Un peu", "Pas du tout").
I try using this code
# Same horizontal bar chart of INTERET_ENVIRONNEMENT, split into one panel per
# level of INTERET_COMPOSITION (so this cross-tabulates the two variables
# rather than showing one panel per variable).
ggplot(data = DATABASE, mapping = aes(x = INTERET_ENVIRONNEMENT)) +
  geom_bar(width = 0.7, fill = "orange") +
  scale_x_discrete(limits = ords) +
  coord_flip() +
  labs(x = "Storm Type", y = "Number of Observations") +
  facet_wrap(~INTERET_COMPOSITION)
But, it doesn't give the needed results.
To make my example reproducible, here is a data set containing 4 categorical variables that share the same modalities:
structure(list(INTERET_COMPOSITION = structure(c(1L, 1L, 1L,
3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 1L,
1L, 1L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Beaucoup",
"Pas du tout", "Un peu"), class = "factor"), INTERET_ENVIRONNEMENT = structure(c(1L,
3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 1L,
1L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("Beaucoup", "Pas du tout", "Un peu"), class = "factor"),
INTERET_ORIGINE_GEO = structure(c(1L, 2L, 1L, 1L, 3L, 1L,
3L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 2L, 1L, 1L, 3L, 1L, 1L,
1L, 1L, 3L, 3L, 1L, 2L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
3L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L), .Label = c("Beaucoup",
"Pas du tout", "Un peu"), class = "factor"), INTERET_ALIM_NATURELLE = structure(c(1L,
3L, 3L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 1L, 1L, 1L, 2L,
3L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 3L, 1L, 1L, 3L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("Beaucoup", "Pas du tout", "Un peu"
), class = "factor")), .Names = c("INTERET_COMPOSITION",
"INTERET_ENVIRONNEMENT", "INTERET_ORIGINE_GEO", "INTERET_ALIM_NATURELLE"
), row.names = c(1L, 2L, 3L, 5L, 9L, 13L, 14L, 16L, 18L, 19L,
20L, 24L, 27L, 29L, 30L, 32L, 33L, 35L, 36L, 37L, 39L, 44L, 49L,
51L, 52L, 53L, 55L, 56L, 61L, 62L, 63L, 65L, 66L, 67L, 71L, 74L,
75L, 80L, 81L, 84L, 86L, 90L, 92L, 95L, 96L, 99L, 100L, 103L,
104L, 107L), class = "data.frame")
>
How can I plot their horizontal bar plots in the same figure?
You have to transform your data from wide to long
library(tidyverse)
# Reshape from wide (one column per variable) to long (one row per
# variable/answer pair), then draw one horizontal bar chart per variable.
d %>%
  # k = original column name, v = the answer given for that variable.
  # pivot_longer() replaces the superseded gather().
  pivot_longer(cols = everything(), names_to = "k", values_to = "v") %>%
  ggplot(aes(x = v)) +
  geom_bar(fill = "orange", width = 0.7) +
  coord_flip() +
  # One panel per original variable.
  facet_wrap(~k)
I have the following data from an experiment in which I want to find out how a bacterium reacts, on two similar levels (nucleic acids), to a treatment.
Treatment happened after the sampling on day 0 (vertical dashed line). As you can see, it got more abundant (line is average, dots are measured triplicates). I have 3 technical replicates (doing the lab work 3 times on the same sample) but no biological replicates.
For publication purposes, I want to show that the induced change is significant. So far I used a two tailed t test for heteroscedastic samples, using the 3 sample points day -25 to 0 as sample group 1 and 5 sample points day 3 to 17 as sample group 2 (this is the range where most of my bacteria reacted).
Afterwards I performed the Bonferroni correction on the p values to correct for multiple testing. But is this the correct way and is it possible with only technical replicates?
I'm finding many hints on fitting models to my graph, but I only want to test for statistical significance of the difference between before and after treatment. So I'm searching for the correct statistics and also how to apply them in R. Any help appreciated!
here is the plot:
require(ggplot2)
require(scales)
# Abundance over time for each nucleic acid: black points are the individual
# measurements, the black line is their mean per day, and the line type
# distinguishes the two nucleic acids. The x axis is shifted by 69 so that
# day 69 of the raw data becomes day 0.
ggplot(
  data = sample_data,
  mapping = aes(
    x = days - 69,
    y = value,
    colour = nucleic_acid,
    group = nucleic_acid,
    linetype = nucleic_acid
  )
) +
  # Dashed vertical marker at day 0; a constant xintercept draws the line once
  # instead of once per data row.
  geom_vline(xintercept = 0, linetype = "dashed", size = 1.2) +
  geom_point(colour = "black") +
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0). The constant
  # colour overrides the colour mapping inherited from ggplot().
  # `size` is kept for lines so the code still runs on ggplot2 < 3.4.0.
  stat_summary(fun = "mean", geom = "line", colour = "black", size = 1.5) +
  scale_linetype_manual(
    values = c("dna" = 1, "cdna" = 4),
    name = "Nucleic acid ",
    breaks = c("cdna", "dna"),
    labels = c("16S rRNA", "16S rDNA")
  ) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 20)) +
  scale_y_continuous(
    # Spelled out as `labels` (the original `label` relied on partial argument
    # matching). Formats ticks as plotmath "a %*% 10^b", with a plain "0".
    labels = function(x) {
      ifelse(
        x == 0,
        "0",
        parse(text = gsub("[+]", "", gsub("e", " %*% 10^", scientific_format()(x))))
      )
    }
  ) +
  theme_bw() +
  # Tweaks applied on top of theme_bw(); grid lines are hidden (colour = NA).
  theme(
    axis.title.y = element_text(angle = 90, vjust = 0.5),
    axis.text = element_text(size = 12),
    legend.text = element_text(size = 11),
    panel.grid.major = element_line(colour = NA, size = 0.2),
    panel.grid.minor = element_line(colour = NA, size = 0.5),
    legend.position = "bottom",
    legend.background = element_rect(fill = "grey90", linetype = "solid")
  ) +
  labs(
    x = "Days",
    y = expression(atop("Absolute abundance in cell equivalents", bgroup("[", relative ~ abundance ~ x ~ cells ~ mL^{-1}, "]")))
  )
and here is my data:
sample_data<-structure(list(time = c(10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L,
11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L,
13L, 13L, 13L, 14L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L,
15L, 15L, 16L, 16L, 16L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 18L,
18L, 18L, 18L, 18L, 18L, 19L, 19L, 19L, 19L, 19L, 19L, 4L, 4L,
4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L,
7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L,
9L, 9L), days = c(83L, 83L, 83L, 83L, 83L, 83L, 86L, 86L, 86L,
86L, 86L, 86L, 91L, 91L, 91L, 91L, 91L, 91L, 98L, 98L, 98L, 98L,
98L, 98L, 105L, 105L, 105L, 105L, 105L, 105L, 112L, 112L, 112L,
112L, 112L, 112L, 119L, 119L, 119L, 119L, 119L, 119L, 126L, 126L,
126L, 126L, 133L, 133L, 133L, 133L, 133L, 133L, 140L, 140L, 140L,
140L, 140L, 140L, 44L, 44L, 44L, 44L, 44L, 44L, 62L, 62L, 62L,
62L, 62L, 62L, 69L, 69L, 69L, 69L, 69L, 69L, 72L, 72L, 72L, 72L,
72L, 72L, 76L, 76L, 76L, 76L, 76L, 76L, 79L, 79L, 79L, 79L, 79L,
79L), parallel = c(3L, 1L, 2L, 2L, 3L, 1L, 2L, 3L, 3L, 2L, 1L,
1L, 2L, 1L, 3L, 3L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 2L, 2L, 3L, 1L,
1L, 3L, 2L, 1L, 1L, 2L, 3L, 3L, 2L, 2L, 3L, 3L, 1L, 1L, 2L, 3L,
1L, 1L, 3L, 2L, 3L, 1L, 1L, 2L, 3L, 1L, 2L, 3L, 3L, 1L, 2L, 2L,
3L, 3L, 1L, 1L, 2L, 2L, 3L, 1L, 1L, 3L, 2L, 1L, 2L, 3L, 3L, 1L,
2L, 2L, 3L, 3L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 3L, 3L, 1L, 2L, 3L,
3L, 1L, 2L), nucleic_acid = structure(c(1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L,
1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("cdna", "dna"), class = "factor"),
habitat = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "water", class = "factor"),
value = c(5316639.62, 6402573.912, 6294710.95, 2369809.996,
2679661.691, 2105693.166, 2108794.224, 2487177.041, 6021765.438,
5524939.499, 6016021.786, 2628427.206, 3164229.113, 896068.7656,
2966515.364, 4436008.425, 1860580.149, 3911309.508, 888489.0268,
1004334.365, 1141636.992, 961140.0729, 1072009.18, 1134997.852,
668013.4333, 459645.1058, 645944.1129, 702293.6865, 590620.3693,
642136.7523, 932531.1588, 1224299.065, 1502344.5, 1545034.46,
1122002.798, 1411050.57, 1465061.711, 1378876.488, 810348.2823,
1361496.248, 1056558.288, 897876.4169, 931519.9524, 1165768.09,
957873.9045, 746011.7558, 624116.5603, 522209.2283, 551120.1371,
440096.4446, 565108.4447, 373304.8604, 266595.7171, 333767.4042,
185612.6681, 144899.8736, 173739.3969, 211490.827, 223815.0867,
296455.4243, 1278759.217, 247292.4355, 1171554.199, 1146278.577,
227443.8462, 233542.6719, 253224.2629, 875040.4892, 1151921.616,
1285744.479, 355381.9156, 110724.7928, 252238.9632, 912865.3372,
608269.6498, 500307.5301, 774955.9598, 1374106.94, 3121909.308,
1071086.757, 3033665.589, 2984567.998, 1396313.444, 1356465.773,
4480581.956, 4273141.231, 4957691.655, 1910056.657, 5520085.32,
5094686.657, 5990052.759, 2272441.566, 1513268.608, 1821716.75
), treatment2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Treatment", class = "factor")), .Names = c("time",
"days", "parallel", "nucleic_acid", "habitat", "value", "treatment2"
), class = "data.frame", row.names = c(51243L, 51244L, 51245L,
51246L, 51247L, 51248L, 51255L, 51256L, 51257L, 51258L, 51259L,
51260L, 51267L, 51268L, 51269L, 51270L, 51271L, 51272L, 51279L,
51280L, 51281L, 51282L, 51283L, 51284L, 51291L, 51292L, 51293L,
51294L, 51295L, 51296L, 51303L, 51304L, 51305L, 51306L, 51307L,
51308L, 51315L, 51316L, 51317L, 51318L, 51319L, 51320L, 51326L,
51327L, 51328L, 51329L, 51336L, 51337L, 51338L, 51339L, 51340L,
51341L, 51348L, 51349L, 51350L, 51351L, 51352L, 51353L, 51360L,
51361L, 51362L, 51363L, 51364L, 51365L, 51372L, 51373L, 51374L,
51375L, 51376L, 51377L, 51384L, 51385L, 51386L, 51387L, 51388L,
51389L, 51396L, 51397L, 51398L, 51399L, 51400L, 51401L, 51408L,
51409L, 51410L, 51411L, 51412L, 51413L, 51420L, 51421L, 51422L,
51423L, 51424L, 51425L))
If you want to test for significance of the effect of your treatment and you know how to fit model(s) on your data, you can simply fit a model which includes your treatment effect and a model which doesn't. Then compare the models by means of a likelihood ratio test.
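In R it is pretty straightforward (for simplicity I assume a linear model, which may not be the best choice for your data; note that for `lm` fits, `anova()` compares the nested models with an F-test rather than a likelihood ratio test):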
# Fit two nested linear models: one without and one with the treatment term.
# `y`, `Time`, `Treatment` and `data` are placeholders for your own response,
# time variable, treatment indicator and data frame.
model_null <- lm(y ~ Time, data = data)
model_effect <- lm(y ~ Time + Treatment, data = data)

# Compare the nested models. Listing the smaller (null) model first makes the
# reported Df and Sum of Sq for the added term positive; the test statistic
# and p-value do not depend on the order.
anova(model_null, model_effect)
I have a binomial GLM in R, with several predictors that are both continuous and categorical.
The response variable is "Presence", which is binary (0/1).
Length is a continuous variable, while all others are categorical.
I am trying to plot predictions for each of the variables in the final model, particularly for "length", but I'm having difficulties.
My data are the following:
MyData<-structure(list(site = structure(c(3L, 1L, 3L, 2L, 1L, 4L, 3L,
4L, 1L, 2L, 4L, 5L, 5L, 1L, 4L, 3L, 2L, 4L, 1L, 4L, 5L, 1L, 5L,
4L, 3L, 1L, 3L, 5L, 5L, 4L, 4L, 3L, 1L, 5L, 1L, 3L, 1L, 4L, 4L,
3L, 4L, 4L, 2L, 3L, 1L, 4L, 2L, 1L, 1L, 4L, 4L, 4L, 1L, 3L, 3L,
2L, 1L, 4L, 2L, 5L, 5L, 3L, 3L, 2L, 5L, 2L, 4L, 5L, 2L, 4L, 4L,
2L, 5L, 2L, 3L, 5L, 4L, 4L, 5L, 1L, 1L, 3L, 2L, 4L, 3L, 1L, 4L,
3L, 1L, 4L, 3L, 3L, 4L, 5L, 1L, 3L, 2L, 3L, 2L, 3L, 2L, 1L, 1L,
5L, 5L, 1L, 5L, 2L, 3L, 4L, 4L, 3L, 2L, 3L, 3L, 5L, 3L, 3L, 3L,
5L, 1L, 5L, 2L, 3L, 4L, 5L, 5L, 1L, 4L, 2L, 5L, 3L, 2L, 5L, 4L,
3L, 3L, 3L, 1L, 1L, 4L, 1L, 2L, 4L, 5L, 1L, 1L, 2L, 2L, 5L, 3L,
4L, 4L, 1L, 5L, 2L, 4L, 3L, 1L, 1L, 3L, 2L, 1L, 3L, 4L, 3L, 1L,
5L, 3L, 3L, 3L, 4L, 1L, 1L, 3L, 4L, 3L, 1L, 1L, 1L, 1L, 5L, 1L,
3L, 4L, 3L, 2L, 1L, 1L, 2L, 5L, 2L, 1L, 5L, 3L, 1L, 4L, 1L, 3L,
3L, 3L, 3L, 5L, 1L, 4L, 1L, 1L, 3L, 3L, 4L, 1L, 3L, 3L, 4L, 2L,
5L, 5L, 5L, 1L, 4L, 4L, 3L, 1L, 2L, 3L, 1L, 3L, 1L, 1L, 4L, 3L,
1L, 1L, 5L, 3L, 1L), .Label = c("R1a", "R1b", "R2", "Za", "Zb"
), class = "factor"), species = structure(c(1L, 1L, 3L, 3L, 3L,
1L, 3L, 1L, 4L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 3L, 1L, 1L,
1L, 1L, 4L, 3L, 4L, 3L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 3L, 1L, 4L,
3L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 3L, 1L, 1L, 3L, 1L, 1L, 1L,
1L, 3L, 3L, 1L, 2L, 3L, 1L, 2L, 1L, 1L, 3L, 1L, 3L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 3L, 1L, 3L,
3L, 1L, 3L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 3L, 4L, 3L, 1L,
1L, 3L, 1L, 1L, 4L, 1L, 3L, 3L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 4L,
1L, 3L, 1L, 3L, 1L, 3L, 3L, 1L, 1L, 1L, 3L, 4L, 3L, 1L, 1L, 3L,
1L, 1L, 4L, 1L, 3L, 1L, 3L, 1L, 2L, 1L, 1L, 2L, 3L, 3L, 3L, 3L,
1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 1L,
3L, 1L, 4L, 3L, 1L, 4L, 1L, 1L, 3L, 1L, 1L, 3L, 1L, 1L, 3L, 3L,
1L, 4L, 3L, 4L, 3L, 1L, 1L, 2L, 3L, 1L, 1L, 1L, 2L, 3L, 4L, 3L,
1L, 1L, 4L, 1L, 1L, 2L, 1L, 1L, 3L, 3L, 1L, 3L, 2L, 4L, 3L, 3L,
1L, 3L, 1L, 4L, 1L, 1L, 4L, 1L, 3L, 1L, 3L, 3L, 3L, 1L, 3L, 1L,
1L, 1L, 3L, 1L, 1L, 1L, 3L), .Label = c("Monogyna", "Other",
"Prunus", "Rosa"), class = "factor"), aspect = structure(c(4L,
4L, 4L, 4L, 4L, 3L, 4L, 3L, 4L, 4L, 3L, 4L, 4L, 4L, 3L, 3L, 4L,
3L, 4L, 3L, 1L, 4L, 4L, 3L, 2L, 4L, 4L, 4L, 4L, 3L, 3L, 4L, 4L,
4L, 4L, 2L, 4L, 3L, 3L, 1L, 3L, 3L, 4L, 4L, 4L, 3L, 4L, 4L, 4L,
3L, 3L, 3L, 4L, 1L, 3L, 4L, 4L, 3L, 4L, 4L, 4L, 3L, 3L, 4L, 1L,
4L, 3L, 4L, 4L, 3L, 3L, 4L, 4L, 4L, 4L, 1L, 3L, 3L, 4L, 4L, 4L,
2L, 4L, 3L, 3L, 4L, 3L, 4L, 4L, 3L, 4L, 3L, 3L, 4L, 4L, 3L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 1L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 4L, 4L,
3L, 2L, 3L, 1L, 2L, 5L, 2L, 4L, 4L, 4L, 3L, 3L, 1L, 2L, 4L, 3L,
4L, 4L, 3L, 4L, 4L, 3L, 4L, 4L, 3L, 4L, 4L, 3L, 4L, 4L, 3L, 1L,
4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 4L, 4L, 4L, 3L, 4L, 4L, 4L, 4L,
4L, 4L, 3L, 3L, 3L, 4L, 4L, 3L, 4L, 2L, 3L, 4L, 4L, 2L, 3L, 2L,
4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 2L, 4L, 3L, 4L, 4L, 4L, 3L, 4L, 4L, 4L, 3L, 4L, 4L, 4L, 3L,
3L, 4L, 2L, 5L, 3L, 4L, 2L, 4L, 4L, 4L, 3L, 3L, 3L, 4L, 4L, 2L,
4L, 3L, 4L, 4L, 3L, 4L, 4L, 4L, 3L, 2L, 4L), .Label = c("East",
"Flat", "North", "South", "West"), class = "factor"), length = c(260L,
60L, 60L, 40L, 240L, 80L, 30L, 100L, 100L, 200L, 70L, 50L, 60L,
35L, 120L, 60L, 500L, 40L, 20L, 70L, 250L, 80L, 50L, 130L, 350L,
170L, 50L, 60L, 90L, 50L, 40L, 110L, 60L, 70L, 70L, 500L, 140L,
50L, 50L, 360L, 50L, 150L, 60L, 270L, 280L, 130L, 130L, 50L,
60L, 30L, 70L, 70L, 60L, 400L, 20L, 30L, 70L, 160L, 340L, 100L,
210L, 60L, 70L, 130L, 50L, 40L, 50L, 80L, 390L, 40L, 110L, 130L,
40L, 230L, 120L, 70L, 80L, 80L, 90L, 70L, 150L, 120L, 50L, 100L,
120L, 10L, 40L, 80L, 180L, 160L, 200L, 40L, 70L, 90L, 50L, 40L,
80L, 80L, 70L, 480L, 90L, 60L, 100L, 140L, 190L, 20L, 70L, 360L,
70L, 130L, 60L, 50L, 320L, 210L, 130L, 180L, 90L, 20L, 300L,
90L, 50L, 130L, 70L, 70L, 40L, 40L, 50L, 40L, 100L, 20L, 70L,
100L, 340L, 70L, 110L, 40L, 230L, 200L, 80L, 35L, 110L, 200L,
50L, 110L, 100L, 50L, 150L, 110L, 50L, 50L, 40L, 70L, 80L, 60L,
100L, 90L, 40L, 300L, 140L, 180L, 140L, 40L, 190L, 100L, 170L,
40L, 120L, 15L, 70L, 340L, 40L, 40L, 70L, 60L, 130L, 140L, 170L,
120L, 90L, 130L, 210L, 50L, 180L, 120L, 100L, 50L, 90L, 70L,
360L, 80L, 30L, 170L, 70L, 300L, 40L, 130L, 120L, 90L, 40L, 40L,
140L, 80L, 400L, 70L, 80L, 60L, 420L, 320L, 200L, 40L, 50L, 70L,
50L, 80L, 50L, 110L, 100L, 120L, 170L, 20L, 110L, 20L, 20L, 30L,
30L, 90L, 150L, 80L, 40L, 90L, 300L, 30L, 70L, 50L, 90L, 200L
), sun = structure(c(1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L,
3L, 3L, 3L, 3L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 2L, 3L, 1L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 1L, 3L,
3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L,
3L, 2L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 1L, 2L, 1L,
1L, 3L, 3L, 3L, 2L, 3L, 3L, 2L, 3L, 3L, 1L, 3L, 3L, 3L, 1L, 3L,
1L, 3L, 3L, 2L, 1L, 3L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 1L, 1L,
2L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 3L, 2L, 1L, 3L, 1L, 1L, 3L, 3L,
1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 1L, 1L, 1L,
3L, 3L, 3L, 1L, 3L, 3L, 1L, 3L, 3L, 1L, 3L, 3L, 1L, 3L, 3L, 3L,
3L, 1L, 3L, 1L, 3L, 1L, 1L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 1L, 1L,
1L, 1L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L,
3L, 3L, 3L, 3L, 2L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 2L,
3L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 1L,
1L, 3L, 3L, 3L, 3L, 1L, 3L, 1L, 3L, 3L, 3L, 1L, 1L, 3L, 3L, 2L,
3L, 3L), .Label = c("Half", "Shade", "Sun"), class = "factor"),
leaf = structure(c(2L, 2L, 4L, 2L, 2L, 2L, 2L, 2L, 4L, 2L,
2L, 4L, 4L, 4L, 2L, 2L, 2L, 4L, 4L, 2L, 2L, 4L, 2L, 2L, 1L,
2L, 2L, 4L, 2L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 2L,
2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 2L,
2L, 4L, 1L, 2L, 4L, 1L, 2L, 4L, 2L, 4L, 2L, 2L, 2L, 1L, 4L,
4L, 1L, 4L, 1L, 2L, 4L, 3L, 2L, 2L, 2L, 2L, 4L, 2L, 4L, 2L,
2L, 2L, 2L, 2L, 4L, 1L, 2L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 4L, 2L, 2L, 1L, 4L, 2L, 2L, 2L, 1L, 4L, 2L, 2L, 1L, 1L,
1L, 2L, 4L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 4L, 2L, 2L, 2L, 2L,
4L, 2L, 2L, 4L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 4L, 2L, 2L, 2L, 4L, 4L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 4L, 2L, 2L, 2L, 4L, 2L, 2L, 2L,
1L, 1L, 2L, 1L, 2L, 2L, 4L, 2L, 2L, 2L, 2L, 2L, 4L, 1L, 2L,
4L, 2L, 2L, 1L, 2L, 2L, 4L, 2L, 4L, 4L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 4L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 4L, 1L, 1L, 2L,
1L, 2L, 2L, 2L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 2L, 4L, 2L,
2L), .Label = c("Large", "Medium", "Scarce", "Small"), class = "factor"),
Presence = c(0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L,
0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L
)), .Names = c("site", "species", "aspect", "length", "sun",
"leaf", "Presence"), row.names = c(NA, 236L), class = "data.frame")
(note that this is a reduced dataset, and I have already removed variables that were dropped during model selection)
The optimal model is:
# Binomial GLM of Presence (0/1) on the predictors kept after model selection:
# length is numeric, the remaining predictors are factors.
model <- glm(
  Presence ~ site + species + aspect + length + sun + leaf,
  data = MyData,
  family = binomial
)
I tried the following, but it wants the other variables too, so I get an error:
# Attempt 1: predict over a grid of lengths only.
plot(MyData$length, MyData$Presence)
# Closing parenthesis of data.frame() restored; without it the snippet does
# not parse at all.
mydat1 <- data.frame(length = seq(from = 10, to = 500, by = 1))
# predict() still fails here: newdata lacks the model's other predictors
# (site, species, aspect, sun, leaf).
pred1 <- predict(model, newdata = mydat1, type = "response")
# Draw the predictions against the grid they were computed on; MyData$length
# has a different number of values than pred1.
lines(mydat1$length, pred1)
So I tried specifying all variables, but then it only puts a horizontal line through the presence data points (and that means I need to specify all possible combinations of factor variables I suppose):
# Attempt 2: predict over a grid of lengths with every factor predictor held
# at one fixed level.
plot(MyData$length, MyData$Presence)
mydat2 <- data.frame(
  length = seq(from = 10, to = 500, by = 1),
  site = "R1a",
  species = "Monogyna",
  aspect = "Flat",
  sun = "Sun",
  leaf = "Scarce"
)
pred2 <- predict(model, newdata = mydat2, type = "response")
# Plot the predictions against the grid they belong to: pred2 has one value
# per row of mydat2 (491), whereas MyData$length has 236 values.
lines(mydat2$length, pred2)
Finally, I tried the following code:
# Fitted probabilities for the observations the model was fitted on.
pred <- predict(model, type = "response")

# 2 x 2 grid of panels; each plot() call fills the next panel.
par(mfrow = c(2, 2))

# Plot the fitted probabilities against every column of MyData in turn
# (this includes the response column Presence itself).
for (column in names(MyData)) {
  plot(MyData[[column]], pred, xlab = column, ylab = "Probability")
}
I am confused by this last one, as I am not able to obtain the curve, plus the output gives me predicted values for variables that are not even in the optimal model.
What I expect under this model is a sigmoidal (S-shaped) curve, I suppose. But that's not what I'm getting.
How can I produce a meaningful plot of predictions?
Any help would be greatly appreciated.
I would use the effects package for some easier results for a single predictor. Here is how:
library(effects)
# Effect of `length` on the fitted response, evaluated at 100 values of
# length and converted to a data frame (the columns fit, lower and upper are
# used by the plotting code that follows).
fit <- as.data.frame(
  effect("length", model, xlevels = 100)
)
Plotting is easy (although note the overplotting):
# Raw 0/1 observations against length ...
plot(x = MyData$length, y = MyData$Presence)
# ... with the fitted effect of length drawn on top.
lines(x = fit$length, y = fit$fit)
Or we can use ggplot2:
library(ggplot2)
# Observations drawn as counts (point area ~ number of overlapping points),
# the fitted effect of length as a red line, and the lower/upper bounds from
# `fit` as a shaded band.
ggplot() +
  geom_count(data = MyData, mapping = aes(x = length, y = Presence)) +
  geom_line(
    data = fit,
    mapping = aes(x = length, y = fit),
    colour = "red",
    size = 1
  ) +
  geom_ribbon(
    data = fit,
    mapping = aes(x = length, ymin = lower, ymax = upper),
    alpha = 0.15
  ) +
  scale_size_area()
We can see that the effect of length is not very impressive.
I have the following
t <- structure(list(name = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("Alice", "Bob",
"Jane Doe", "John Doe"), class = "factor"), school = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("Alice School",
"Bob School", "Someother School", "Someschool College"), class = "factor"),
group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A", "B"), class = "factor"),
question = structure(c(2L, 4L, 6L, 8L, 1L, 3L, 5L, 7L, 2L,
4L, 6L, 8L, 1L, 3L, 5L, 7L, 2L, 4L, 6L, 8L, 1L, 3L, 5L, 7L,
2L, 4L, 6L, 8L, 1L, 3L, 5L, 7L), .Label = c("q1", "q2", "q3",
"q4", "q5", "q6", "q7", "q8"), class = "factor"), mark = c(0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L,
1L), subject = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("C", "M"), class = "factor")), .Names = c("name",
"school", "group", "question", "mark", "subject"), row.names = c(7L,
15L, 23L, 31L, 3L, 11L, 19L, 27L, 8L, 16L, 24L, 32L, 4L, 12L,
20L, 28L, 6L, 14L, 22L, 30L, 2L, 10L, 18L, 26L, 5L, 13L, 21L,
29L, 1L, 9L, 17L, 25L), class = "data.frame")
and I need to produce a data frame in which each student has one combined mark for each subject. The combination is simply a sum of the marks on each question. So, for example, Jane Doe will have 3 on subject C and 2 on subject M. I've been banging my head for long enough with Reduce and other approaches. I could possibly solve this in a very procedural way, but if I could do that with a one-liner (or close approximation), I'd be happier. I'm sure it can be done...
You said it in your question; you want to group_by student and subject and compute the sum
library(tidyverse)
# Total mark per student and subject. `t` is the data frame from the question
# (the original snippet referred to an undefined object `asdf`).
t %>%
  group_by(name, subject) %>%
  # .groups = "drop" returns an ungrouped tibble instead of one still grouped
  # by name.
  summarise(score = sum(mark), .groups = "drop")
Here a data.table solution:
library(data.table)
# setDT() converts `t` to a data.table by reference; the sum of mark is then
# computed within each (name, subject) group. `.()` is data.table's alias for
# list().
setDT(t)[, sum(mark), by = .(name, subject)]
And just for completeness, base R:
# Sum of mark for every combination of name and subject.
aggregate(mark ~ name + subject, data = t, FUN = sum)
This says "aggregate the response variable mark by the grouping variables name and subject, using sum as the aggregation function".