ggplot with three factors and dual error bars - r

I need an xy plot which plots means and error bars for both x and y, with three factors. The three factors are Year (2004-2012), Species (FW, HB), and Region (Kodiak, Shumagin Islands); xmean = mean d13C and ymean = mean d15N.
I can get reasonably close using the following code, but am missing one factor and it's not very aesthetically pleasing. I also get a warning message for exceeding the shape palette.
library(ggplot2)
library(plyr)
GAP_Whales<-structure(list(Species = structure(c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 2L), .Label = c("FW", "HB"), class = "factor"), Year = c(2007L,
2007L, 2007L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L,
2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L,
2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L,
2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L,
2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L, 2004L,
2004L, 2004L, 2004L, 2004L, 2005L, 2005L, 2005L, 2005L, 2005L,
2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L,
2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L,
2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L,
2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L,
2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L,
2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L,
2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2005L,
2005L, 2005L, 2005L, 2005L, 2005L, 2005L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2007L, 2005L, 2005L, 2005L, 2005L,
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L,
2001L, 2001L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2008L, 2008L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2008L, 2008L, 2008L, 2008L, 2008L, 2008L,
2008L, 2008L, 2008L, 2008L, 2008L, 2008L, 2008L, 2008L, 2008L,
2008L, 2008L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L,
2009L, 2009L, 2005L, 2005L, 2007L, 2007L, 2007L, 2008L, 2008L,
2008L, 2008L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L,
2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L,
2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L,
2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2007L
), Region = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("Kodiak", "Shumagin Is."), class = "factor"), d13C = c(-17.741,
-17.841, -17.382, -16.955, -17.504, -17.42814286, -15.89, -16.421,
-17.21328571, -17.90142857, -17.654, -19.225, -19.03361128, -18.29057143,
-17.28, -16.897, -18.18585714, -18, -17.619, -17.47014286, -18.382,
-16.807, -18.55242857, -18.527, -17.72557143, -17.06471429, -18.02757143,
-17.599, -17.57614286, -17.36385714, -17.19728571, -18.09871429,
-17.3, -15.928, -17.26071429, -17.85271429, -17.39342857, -16.98,
-16.847, -17.84728571, -16.673, -17.174, -16.277, -17.965, -17.60985714,
-16.6, -17.47885714, -17.46, -17.73342857, -17.028, -18.332,
-18.548, -18.22, -18.035, -17.138, -17.387, -18.314, -18.27,
-17.904, -18.497, -18.264, -18.593, -18.264, -18.008, -17.687,
-18.227, -17.849, -17.713, -18.017, -18.723, -18.793, -17.914,
-18.31, -18.116, -18.65, -17.587, -17.985, -18.793, -17.833,
-17.613, -17.942, -17.86, -17.749, -17.222, -17.286, -17.842,
-18.042, -17.912, -17.858, -18.916, -18.443, -18.638, -18.085,
-17.974, -17.997, -18.387, -18.129, -17.85, -18.699, -18.754,
-18.022, -18.636, -18.197, -18.645, -18.149, -18.157, -18.695,
-18.413, -17.978, -18.447, -17.854, -19.433, -18.251, -17.527,
-17.732, -18.42, -18.089, -17.498, -18.805, -17.677, -17.721,
-18.194, -18.063, -16.987, -18.34342857, -18.46185714, -17.56328571,
-17.84671429, -17.93814286, -18.10157143, -17.786, -17.78442857,
-17.38885714, -16.61228571, -15.97971429, -18.73614286, -18.26371429,
-18.98442857, -17.47014286, -18.12185714, -17.89457143, -18.17728571,
-18.234, -18.83871429, -18.82342857, -18.34314286, -18.43685714,
-18.66757143, -18.6295, -17.553, -17.72555609, -17.42890918,
-18.0937904, -17.3725821, -21.242, -20.107, -19.11, -17.771,
-18.125, -18.577, -17.781, -19.292, -16.776, -20.212, -20.539,
-17.972, -17.986, -18.634, -17.352, -17.409, -17.93, -17.458,
-17.53, -17.321, -17.11, -17.929, -17.244, -17.858, -17.251,
-18.06, -18.22, -18.142, -19.314, -18.412, -17.941, -17.909,
-18.114, -18.783, -18.181, -17.754, -18.484, -17.463, -18.379,
-18.19, -18.227, -17.414, -17.824, -17.436, -17.432, -17.171,
-17.483, -17.64, -17.639, -18.572, -18.545, -18.007, -18.033,
-18.102, -18.301, -17.731, -17.565, -17.68, -17.751, -18.134,
-18.409, -18.336, -18.888, -18.477, -18.25, -18.121, -18.082,
-17.914, -19.337, -19.228, -18.455, -18.657, -18.081, -18.23,
-18.777, -18.935, -18.823, -19.49, -18.383, -18.73, -18.152,
-18.582, -18.653, -18.407, -18.024, -18.994, -17.831, -17.947,
-17.57, -18.142, -17.691, -17.869, -18.513, -18.522, -17.923,
-18.353, -18.278, -17.664, -17.995, -17.786, -18.161, -18.119,
-18.125, -17.098, -17.576, -18.099, -18.713, -17.4, -17.622,
-17.532, -18.007, -18.146, -16.692, -18.678, -19.18, -18.522,
-18.572, -18.476, -19.144, -17.709, -17.742, -18.606, -18.267,
-18.543, -18.301, -19.117, -18.75, -19.394, -19.219, -18.179,
-18.681, -18.835, -18.456, -18.323, -18.148, -18.263, -17.965,
-19.337, -18.301, -19.046, -18.768, -18.017, -17.928, -17.314
), d15N = c(14.166, 14.279, 14.092, 13.464, 13.4, 13.179, 12.895,
13.537, 13.857, 13.775, 14.147, 12.017, 12.531, 12.329, 13.414,
13.777, 12.639, 13.135, 13.833, 13.68, 12.317, 12.237, 11.707,
12.318, 13.574, 14.77, 12.722, 13.772, 13.658, 13.804, 14.07,
15.182, 14.143, 13.54, 12.932, 13.77, 14.332, 12.642, 13.166,
12.412, 12.452, 14.09971429, 13.14, 13.643, 13.393, 13.759, 13.791,
13.244, 12.997, 13.86, 15.53828571, 14.42107143, 14.88228571,
13.32828571, 14.17421429, 12.94985714, 13.21614286, 11.18814286,
12.53371429, 12.67442857, 13.50585714, 12.64092857, 12.83257143,
12.03907143, 12.54642857, 13.70371429, 13.18142857, 14.76085714,
12.74385714, 13.7225, 11.76364286, 13.66457143, 12.65378571,
12.50114286, 14.27671429, 14.10342857, 14.3445, 11.72657143,
12.90221429, 14.71314286, 14.71907143, 14.04371429, 13.75092857,
13.74578571, 14.94164286, 13.07035714, 13.07685714, 12.8775,
13.86664286, 12.87185714, 13.75214286, 13.20285714, 12.46021429,
13.13914286, 13.82028571, 12.52585714, 13.4975, 12.88071429,
12.48042857, 14.29857143, 13.56214286, 13.41, 13.52985714, 13.55592857,
12.80007143, 12.91257143, 13.37457143, 13.60371429, 13.88671429,
13.44635714, 14.18214286, 10.09042857, 12.11571429, 13.00771429,
15.45157143, 13.33135714, 14.58378571, 11.78642857, 12.47628571,
14.46642857, 12.37064286, 13.44335714, 12.39628571, 14.08, 14.0505,
14.34, 14.0145, 13.926, 13.2355, 13.111, 12.3725, 13.888, 13.1075,
14.015, 14.9595, 12.857, 13.277, 12.457, 12.137, 13.124, 13.299,
12.811, 12.231, 11.829, 12.263, 13.036, 13.331, 12.76, 12.262,
14.026, 13.452, 13.769, 13.221, 13.059, 12.754, 12.637, 13.025,
15.123, 14.006, 12.605, 12.636, 14.229, 15.527, 11.583, 13.004,
12.851, 12.921, 12.273, 13.922, 13.429, 12.494, 13.803, 13.55,
13.387, 14.887, 14.248, 14.673, 14.603, 12.879, 12.4, 13.676,
13.648, 13.067, 13.353, 11.703, 14.118, 12.78, 12.293, 12.68,
13.494, 13.309, 13.838, 12.688, 14.418, 14.357, 14.587, 14.714,
14.435, 13.418, 13.013, 12.631, 12.704, 13.091, 12.953, 12.751,
12.409, 12.921, 12.216, 12.594, 12.698, 14.891, 14.692, 13.187,
13.451, 13.023, 11.957, 12.401, 12.527, 13.47, 11.771, 11.848,
12.399, 12.502, 12.678, 12.768, 12.716, 12.671, 12.61, 13.132,
12.999, 13.251, 11.048, 14.384, 12.688, 13.196, 12.875, 13.495,
12.895, 12.992, 12.888, 13.044, 14.195, 13.643, 13.042, 13.15,
13.437, 13.835, 14.884, 13.136, 14.384, 13.927, 14.914, 12.978,
12.841, 13.793, 14.312, 14.219, 14.36, 13.529, 11.837, 13.166,
13.103, 12.798, 13.529, 12.813, 9.574, 13.859, 12.548, 13.405,
12.6, 12.373, 12.964, 12.896, 13.067, 13.896, 14.533, 14.024,
13.042, 13.213, 13.857, 12.857, 12.393, 11.841, 13.702, 13.634,
14.391, 13.719, 13.181, 13.566, 13.314, 13.457, 12.871, 12.383,
13.62, 13.753, 13.388, 12.856, 14.408)), .Names = c("Species",
"Year", "Region", "d13C", "d15N"), class = "data.frame", row.names = c(NA,
-298L))
# Per-group summary: mean and SD of d13C (xmean/xsd) and d15N (ymean/ysd) for
# every Species x Year x Region combination present in GAP_Whales.
means <- ddply(GAP_Whales, .(Species, Year, Region), function(x) c(xmean=mean(x$d13C), xsd=sd(x$d13C), ymean=mean(x$d15N), ysd=sd(x$d15N)))
# NOTE(review): these three assignments create standalone vectors in the
# workspace; they do not change the columns of `means`. aes() looks names up in
# the plot data first, so the plot below presumably never uses them -- confirm.
Species<-as.factor(means$Species)
Region<-as.factor(means$Region)
Year<-as.factor(means$Year)
# NOTE(review): the ggplot2 documentation advises against `means$...` inside
# aes(); the bare column names (xmean, ymean) are the supported form.
p<-ggplot(means, aes(x=means$xmean, y=means$ymean))
# Year -> point shape, Region -> point colour. Species is not mapped to any
# aesthetic, which is the "missing factor". The data contain 9 distinct years,
# more than ggplot2's default shape palette (6 shapes) provides, hence the
# shape-palette warning.
p<-p+geom_point(aes(shape=factor(Year), color=factor(Region)))
# Vertical bars span ymean +/- ysd; horizontal bars span xmean +/- xsd.
# NOTE(review): geom_errorbarh() documents `height` (not `width`) for its end
# caps -- confirm how the installed ggplot2 version treats `width` here.
p<-p + geom_errorbar(aes(ymin=ymean-ysd, ymax=ymean+ysd), width=.1)+
geom_errorbarh(aes(xmin=xmean-xsd, xmax=xmean+xsd), width=.1)
p

Related

x-axis labels not showing in ggplot

The x-axis labels aren't showing in my ggplot and I can't figure out what the issue is. I tried changing scale_x_continuous to scale_x_discrete, but that wasn't the issue. Here's the data and the code:
dput(df)
structure(list(variable = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "X..i..", class = "factor"),
value = c(0.86535786015671, 0.270518550067837, 0.942648772841964,
3.99444934081099, 1.11759146288817, 1.54510976425154, 2.44547105239855,
2.2564822479637, 0.806268193902794, 0.334684787222841, 0.279275582280181,
0.506202944652795, 0.00974858004556866, 0.274742461635902,
0.22071873199716, 0.289511637643534, 0.352185038116792, 0.834072418861261,
1.34338149120735, 1.74931508000265, 1.49348843361896, 4.07991249877895,
1.37225152308336, 0.812438174787708, 0.870119514197706, 1.12552827647611,
0.981401242191818, 0.811544940639505, 0.270314252804909,
0.00129424269740973, 0.138397649461267, 0.320412520877311,
0.200638317328505, 0.311317976283425, 2.27515845904203, 0.701130150695764,
1.19347381779438, 1.74260582346705, 2.04812451743241, 3.30525861365071,
1.09525257544341, 2.6941909849432, 1.24879308689346, 2.32559594481724,
0.489685734592222, 0.401412018111572, 0.209957274618462,
0.715330877881211, 0.844512982038313, 0.220417574806829,
0.440151738500053, 1.32486291268667, 0.771676730656983, 1.295145890213,
2.410181199299, 2.41520949303317, 2.07420663366187, 1.45105393420989,
1.94026424903487, 1.06019651909079, 1.21389399141063, 0.526835419170636,
0.392643071856425, 0.07366669912048, 0.376156996326127, 0.461881411637594,
0.236855843259622, 0.367884917633423), year = c(2005L, 2006L,
2007L, 2008L, 2009L, 2010L, 2011L, 2012L, 2013L, 2014L, 2015L,
2016L, 2017L, 2018L, 2019L, 2020L, 2021L, 2005L, 2006L, 2007L,
2008L, 2009L, 2010L, 2011L, 2012L, 2013L, 2014L, 2015L, 2016L,
2017L, 2018L, 2019L, 2020L, 2021L, 2005L, 2006L, 2007L, 2008L,
2009L, 2010L, 2011L, 2012L, 2013L, 2014L, 2015L, 2016L, 2017L,
2018L, 2019L, 2020L, 2021L, 2005L, 2006L, 2007L, 2008L, 2009L,
2010L, 2011L, 2012L, 2013L, 2014L, 2015L, 2016L, 2017L, 2018L,
2019L, 2020L, 2021L), tenor = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L), .Label = c("1", "5", "10", "average"), class = "factor")), row.names = c(NA,
-68L), class = "data.frame")
# Line + point plot of `value` against `year`, one colour per `tenor`.
# NOTE(review): `year` holds calendar years (2005-2021), but the breaks below
# are seq(1:17), i.e. the positions 1..17. Those positions lie outside the
# plotted x range, which is presumably why no tick labels are drawn.
ggplot(df, aes(year, value, color = tenor)) +
geom_line(size=0.5) + scale_x_continuous(breaks = seq(1:17),labels = seq(2005,2021)) +
geom_point() +
xlab("year")
If you want to force ggplot to plot every x-axis label, you can use scale_x_continuous(breaks = 2005:2021) or breaks = df$year:
# Draw one line (with points) per tenor and place an x-axis tick at every
# year value that occurs in the data.
ggplot(data = df, mapping = aes(x = year, y = value, colour = tenor)) +
  geom_line(size = 0.5) +
  scale_x_continuous(breaks = df$year) +
  geom_point() +
  labs(x = "year")

speed up modelling of subgroups in large data frame

I need to perform an analysis with glmer on many different subgroups of a large dataset and only extract the estimate and z-value of each model. This works perfectly fine if I only use a small subset of my data (or some dummy data, as attached below), but when I try to include the whole data set, it takes forever. Currently I am using this bit of code:
# For every (region, year, species) group, fit a binomial GLMM
# (presence ~ transect with a random intercept per road) and pull two entries
# out of the model's coefficient table.
# NOTE(review): the identical model is fitted twice per group, once for each
# output column.
# NOTE(review): coef(summary(.)) is a matrix and [k] indexes it column-major.
# With the 2-row, 4-column table shown in the answer output, [2] is the
# transect estimate and [6] is the transect z value rather than its p-value,
# so the name `p_val` looks misleading -- confirm the intent.
slope_range <- df %>%
group_by(region, year, species) %>%
summarise(slope = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[2],
p_val = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[6])
As I said, this works fine, but it is very slow on a large data set. I'm aware that I could also just write multiple loops, but I assume this would take even longer. Does anyone have a better solution for making it faster? Thanks!
Dummy data:
> dput(df)
structure(list(region = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("ARG", "CHE"), class = "factor"),
transect = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), presence = c(1L, 1L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L,
1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L,
1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L), year = c(2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L), species = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("a", "b"), class = "factor"),
road = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("FG", "MK", "PL", "XY"), class = "factor")), class = "data.frame", row.names = c(NA,
-160L))
You are calling coef(summary(glmer(...))) twice for each group, so you can cut the execution time roughly in half by fitting the model and extracting the coefficients once for each group. The following code will extract all the coefficients and their Z and p-values, not just the two values you specified, which I think is preferable if you might end up needing them later. Of course it can be easily modified to discard the other coefficients and keep only the two you specified.
code
library(tidyverse)
library(lme4)
# Fit the mixed model once per (region, year, species) group and return the
# whole coefficient table for that group, labelled by coefficient.
df %>%
  group_by(region, year, species) %>%
  group_modify(function(.x, .y) {
    # .x is the data for the current group; .y (the group keys) is not needed.
    model <- glmer(presence ~ transect + (1 | road), family = "binomial", data = .x)
    data.frame(variable = c('Intercept', 'transect'), coef(summary(model)))
  })
output
# A tibble: 16 x 8
# Groups: region, year, species [8]
region year species variable Estimate Std..Error z.value Pr...z..
<fct> <int> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
1 ARG 2007 a Intercept 6.11 2.81 2.17 0.0300
2 ARG 2007 a transect -0.743 0.361 -2.06 0.0398
3 ARG 2007 b Intercept 1.91 1.22 1.57 0.116
4 ARG 2007 b transect -0.396 0.208 -1.90 0.0570
5 ARG 2017 a Intercept 3.95 1.73 2.28 0.0223
6 ARG 2017 a transect -0.654 0.275 -2.38 0.0174
7 ARG 2017 b Intercept 2.44 1.33 1.83 0.0668
8 ARG 2017 b transect -0.396 0.208 -1.90 0.0570
9 CHE 2007 a Intercept 3.95 1.73 2.28 0.0223
10 CHE 2007 a transect -0.654 0.275 -2.38 0.0174
11 CHE 2007 b Intercept 2.44 1.33 1.83 0.0668
12 CHE 2007 b transect -0.396 0.208 -1.90 0.0570
13 CHE 2017 a Intercept 6.11 2.81 2.17 0.0300
14 CHE 2017 a transect -0.743 0.361 -2.06 0.0398
15 CHE 2017 b Intercept 1.91 1.22 1.57 0.116
16 CHE 2017 b transect -0.396 0.208 -1.90 0.0570
You could use a parallel approach as suggested earlier, e.g. with parallel::mclapply (on my 6-core machine using more than 4 cores gave only marginal improvements, though).
You could speed up glmer using nAGQ=0, at the cost of precision (see https://stats.stackexchange.com/questions/132841/default-lme4-optimizer-requires-lots-of-iterations-for-high-dimensional-data).
Example code with benchmarks:
invisible(lapply(c("lme4", "data.table", "tidyverse", "parallel", "microbenchmark"),
require, character.only = TRUE))
#> Loading required package: lme4
#> Loading required package: Matrix
#> Loading required package: data.table
#> Loading required package: tidyverse
#> Loading required package: parallel
#> Loading required package: microbenchmark
df <- structure(list(region = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("ARG", "CHE"), class = "factor"),
transect = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), presence = c(1L, 1L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L,
1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L,
1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L), year = c(2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L), species = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("a", "b"), class = "factor"),
road = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("FG", "MK", "PL", "XY"), class = "factor")), class = "data.frame", row.names = c(NA,
-160L))
## Your function for comparison
## Benchmark baseline: the question's original pipeline wrapped in a function.
## It takes no arguments and reads `df` from the enclosing environment.
## The same glmer model is fitted twice per (region, year, species) group,
## once for each output column; this is kept as-is so the timing reflects the
## original approach.
tidy_fun <- function(){
df %>%
group_by(region, year, species) %>%
summarise(slope = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[2],
p_val = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[6])
}
# Fit one binomial GLMM (presence ~ transect, random intercept per road) and
# return the transect slope together with its test statistic.
#
# Arguments:
#   presence, transect, road  vectors of equal length for a single subgroup.
#   nAGQ                      passed through to glmer(); 0L trades precision
#                             for speed.
# Returns a one-row data.table with columns:
#   slope  the estimated coefficient of `transect`
#   p_val  the Wald z value of `transect`. NOTE: despite its name this is the
#          z statistic, not the p-value; the original code read element [6] of
#          the 2 x 4 coefficient matrix, which is row 2 of the third
#          ("z value") column. The column name is kept so existing callers
#          keep working.
gf2 <- function(presence, transect, road, nAGQ = 1L) {
  fit <- glmer(presence ~ transect + (1 | road), family = "binomial", nAGQ = nAGQ)
  res <- coef(summary(fit))
  # Index by row/column name instead of linear position: if the coefficient
  # table ever has a different shape (e.g. a dropped term), this fails loudly
  # rather than silently returning an unrelated cell.
  data.table(
    slope = res["transect", "Estimate"],
    p_val = res["transect", "z value"]
  )
}
# Fit gf2() once per (region, year, species) group, distributing the groups
# over `mc.cores` workers via parallel::mclapply.
#
# Arguments:
#   mc.cores  number of cores handed to mclapply().
#   nAGQ      forwarded to gf2() / glmer().
# Reads `df` from the enclosing environment (it is not a parameter).
# NOTE(review): the parallel documentation states that mc.cores > 1 is not
# supported on Windows -- confirm the target platform.
parLM <- function(mc.cores=4L, nAGQ=1L){
# Keyed copy of df: setting the key sorts rows by region, year, species.
DT <- data.table(df, key = c("region","year","species"))
# One row per group; `irange` is a list column holding the smallest and
# largest row number (.I) of that group within DT.
iDT <- DT[,by=.(region, year, species),.(irange=.(range(.I)))]
# For group x: expand its (min, max) pair into seq(min, max), take those rows
# of DT and fit the model on them. This relies on each group's rows being
# contiguous in DT, which presumably follows from the key sort above.
# NOTE(review): seq(nrow(iDT)) yields c(1, 0) when iDT has no rows -- confirm
# that an empty `df` cannot occur.
result <- mclapply(seq(nrow(iDT)),
function(x) DT[do.call(seq, as.list(iDT[x, irange][[1]])),
.(gf2(presence, transect, road, nAGQ=nAGQ))], mc.cores=mc.cores)
# Bind the per-group results next to the group keys, then drop column 4
# (the helper column `irange`).
return(cbind(iDT, rbindlist(result))[,-4])
}
# Time four variants, 10 evaluations each: the original pipeline (messages
# suppressed), parLM on 4 cores with nAGQ = 1, and parLM with nAGQ = 0 on
# 1 core and on 4 cores.
microbenchmark(
original = suppressMessages(tidy_fun()),
multicore = parLM(mc.cores = 4L, nAGQ = 1L),
singlecore.nAGQ0 = parLM(mc.cores = 1L, nAGQ = 0L),
multicore.nAGQ0 = parLM(mc.cores = 4L, nAGQ = 0L),
times=10L)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> original 898.2732 925.0621 963.7452 940.9577 973.0648 1157.0030 10
#> multicore 319.1234 334.4151 347.8024 344.1370 362.6539 373.8189 10
#> singlecore.nAGQ0 237.4782 245.4084 262.6290 268.1308 274.8516 280.7944 10
#> multicore.nAGQ0 132.3356 132.9963 137.2777 135.8659 141.5145 144.2564 10
#> cld
#> d
#> c
#> b
#> a

Automate coding (sum) in R

First of all, I would like to apologise if I did not use the correct jargon.
I have the dataset as below which contains a wide range of categories
Here some excerpt from dput (using droplevels)
structure(list(
x = c(2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L), *[ME: there are more years than 2010...]*
y = c(7.85986, 185.81068, 107.24097, 7094.74649,
1.4982, 185.77319, 5090.79354, 167.58584, 4189.64609, 157.08277,
3927.06932, 2.86732, 71.683, 4.70123, 117.53085, 2.93452, 73.36292,
1.4982, 18.18734, 901.14744, 0.90268, 13.77532, 613.38298, 0.01845,
0.0681, 7.19925, 3.75315, 0.14333, 136.54008, 0.04766, 0.59077,
28.97255, 0.38608, 115.05258, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
x1 = structure(c(4L, 2L, 3L, 1L, 4L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 4L, 2L, 1L, 4L, 2L, 1L, 4L, 2L,
1L, 2L, 4L, 1L, 4L, 2L, 1L, 4L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L), .Label = c("All greenhouse gases - (CO2 equivalent)",
"CH4", "CO2", "N2O"), class = "factor"),
x2 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Austria",
class = "factor"),
x4 = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 4L,
4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 9L, 10L,
10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L, 13L, 13L, 14L, 14L,
15L, 15L, 16L, 16L, 17L, 17L, 18L, 18L), .Label = c("3",
"3.1", "3.A", "3.A.1", "3.A.2", "3.A.3", "3.A.4", "3.B",
"3.B.1", "3.B.2", "3.B.3", "3.B.4", "3.B.5", "3.C", "3.C.1",
"3.C.2", "3.C.3", "3.C.4"), class = "factor")), class = "data.frame",
row.names = c(NA,
-44L))
I want to know whether the sum of the subcategories in x4 (e.g. 3.B.1+3.B.2+...+3.B.n) equals the figure stated for the parent category (e.g. 3.B), i.e. the sum as stated in the csv, for a given year and country. I want to verify the sums.
To get the sum of the subcategories I have this
# Sum of the sub-categories 3.A.1 to 3.A.4 for Austria in 2010.
# Every column is reached through `df$`, so the expression does not depend on
# free-standing objects named `x` or `x2` existing in the workspace.
sum(df$y[df$x4 %in% c("3.A.1", "3.A.2", "3.A.3", "3.A.4") &
  df$x == "2010" & df$x2 == "Austria"])
To receive the sum of the parent category I have this
# Figure stated for the parent category 3.A for Austria in 2010.
# Every column is reached through `df$`, so the expression does not depend on
# free-standing objects named `x` or `x2` existing in the workspace.
sum(df$y[df$x4 %in% c("3.A") & df$x == "2010" & df$x2 == "Austria"])
Next I would need an operation which checks whether the results of both codes are equal (True/false). However, I have more than 20 countries, 20 years, dozens of categories to check. With my newbie approach I would be writing code for ages...
Is there any way to automate this? Basically, I am looking for code which is able to do the following
1) Run for one category, go to next one
2) once done with categories change year and start again with categories
3) ... same for countries....
Any sort of help would be appreciated, and even suggestions on how to use the right jargon in the title. Thanks in any case
Here's a potential solution using dplyr (might require some tweaking based on the full dataset):
library(dplyr)
# Tag every row with its parent category and with whether it is the parent
# itself or one of its children. The regex keeps at most the first two
# dot-separated components of the code (e.g. "3.A.1" -> "3.A"), so it assumes
# single-character components and only resolves the second level.
df %>%
  mutate(
    parent = gsub("(.?\\..?)\\..*", "\\1", x4),
    type = ifelse(parent == x4, "Parent", "Child")
  ) %>%
  # Sum y by category, row type, year and country
  group_by(parent, type, x, x2) %>%
  summarize(total = sum(y)) %>%
  # Drop the leftover grouping so the comparison below is made row by row
  ungroup() %>%
  # One row per parent/year/country, Child and Parent sums side by side
  tidyr::spread(type, total) %>%
  # all.equal() compares whole vectors and yields a single value, so it is
  # applied to each row separately; a sum missing on one side gives FALSE
  mutate(
    equals = vapply(
      seq_along(Child),
      function(i) isTRUE(all.equal(Child[[i]], Parent[[i]])),
      logical(1)
    )
  )
Result using your (new) data:
parent x x2 Child Parent equals
<chr> <int> <fct> <dbl> <dbl> <lgl>
1 3 2010 Austria NA 7396. FALSE
2 3.1 2010 Austria NA 5278. FALSE
3 3.A 2010 Austria 4357. 4357. TRUE
4 3.B 2010 Austria 921. 921. TRUE
5 3.C 2010 Austria 0 0 TRUE
I can see from your new data that you have two levels of parents. My solution will only work for the second level (e.g. 3.1 and its children), but can be easily tweaked to also work for the top level.

r - line graph plot is not connected (financial data)

I'm trying to create a line graph using ggplot showing yearly funding received (y-variable=total_US_received; x-variable=year).
By the look of the graph I suspect the huge number of zeros affects the outcome. Can these somehow be ignored?
The code I'm using is:
# Line chart of the funding received (y axis) against the year (x axis)
ggplot(data = countrylevel, mapping = aes(x = year, y = Total_US_received)) +
  geom_line()
The result I get looks like this
Any suggestions on how to improve the graph would be appreciated.
EDIT: opening the data frame with read_csv2 I got following result:
result using read_csv2
Here's my data (compressed):
structure(list(year = c(2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L, 2006L,
2006L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L), Total_US_required = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 167L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 104L, 2L,
2L, 2L, 2L, 63L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 47L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 151L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 111L, 2L, 2L, 2L, 2L, 34L,
2L, 2L, 2L, 2L, 37L, 2L, 2L, 2L, 141L, 129L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 96L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
117L, 2L, 36L, 2L, 143L, 53L, 2L, 2L, 2L, 2L, 2L, 25L, 2L, 2L,
2L, 7L, 35L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 185L, 2L,
2L, 2L, 2L, 99L, 2L, 2L, 206L, 2L, 2L, 2L, 12L, 2L, 2L, 2L, 2L,
2L, 2L, 124L), .Label = c("", "0", "1,32E+11", "1000000", "100000000",
"1005205443", "10072701,5", "10200000", "103784200,5", "1040147",
"104805000", "1071103", "10750000", "1090480364", "109153", "11060703",
"11068500,5", "11101695", "11354680", "1143893994", "11702455",
"11956538", "12120011", "12132493", "12410092", "12500000", "12552013",
"12808836", "129220163", "12940269", "1299891296", "1312327112",
"13316000", "137911265", "13796751", "13985000", "14300000",
"14457401", "14781209", "14840854", "1502221330", "1519195",
"15274599", "1537800", "15617154", "15710660", "16200000", "166500000",
"16701505", "168584522", "17054786", "1776123", "179338641",
"18173270,5", "18215196", "18333333,33", "18664384", "187181768",
"18922351", "19151715", "1918929", "19456115", "199823581", "20000000",
"200000000", "20020037", "201262789", "2026765", "20379635,5",
"204000000", "20564664", "20750000", "20800000", "2090176", "212054562",
"218740000", "220000000", "22024649", "22059615", "221897971",
"22212807", "2280767", "22815000", "23000000", "230197879", "23161089",
"2319522", "24980000", "251298009", "26000000", "26450000", "26547636",
"26566535", "2684877", "2768871", "2805517", "28472211", "28748910,75",
"28986609", "30000000", "30392419", "30486643", "306186910",
"3070192", "31170915", "32000000", "32818841", "33500000", "34328000",
"349570726", "35000000", "35274117", "35475942", "35637337",
"35732550", "36576823", "3691769,5", "37011827", "38112323",
"382458072", "38372246", "384900", "38600000", "38677251", "395551054",
"4013114", "40571983", "4165567", "4250000", "42592000", "4271640",
"43000000", "4485517", "4500000", "45553092", "45570852", "4600000",
"46358480", "477078000", "48500000", "48693043", "4969559", "49771737",
"5000000", "5047653", "52025932", "52534097", "54000000", "54300000",
"544088494", "546938906", "54712144,5", "553162739", "561892820",
"57000000", "57400000", "581079038", "582730988", "598981", "59962157",
"6000000", "603544533", "60589173,6", "606254626", "60750000",
"6120664", "61506194,5", "6250000", "632366288", "63762742,5",
"6431433", "651971301", "6594817,5", "6670000", "6671000", "67500000",
"67723257", "68000000", "68707342,5", "69900000", "7000000",
"70409614", "70534845", "7071951", "71502922", "718751,5", "72263676",
"72767987", "739204", "741000000", "76816821", "7712871", "774943253",
"776000000", "7804442", "782018", "78900000", "79310556", "798807640",
"8010000", "83045026", "839413371", "86000000", "87699664,5",
"9052940", "9114864", "980000", "98147205", "993069", "9945998",
"99676205"), class = "factor"), Total_US_received = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 156L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 39L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 54L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 5L, 2L, 2L, 2L,
2L, 202L, 2L, 2L, 2L, 26L, 144L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 96L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 200L, 2L, 198L,
2L, 87L, 137L, 2L, 2L, 2L, 2L, 2L, 213L, 2L, 2L, 2L, 165L, 139L,
2L, 2L, 2L, 51L, 2L, 2L, 2L, 2L, 2L, 2L, 8L, 2L, 2L, 2L, 2L,
89L, 2L, 2L, 124L, 2L, 2L, 2L, 12L, 2L, 2L, 2L, 65L, 2L, 2L,
105L), .Label = c("", "0", "1000000", "1015614409", "102196912",
"10286090", "1040146", "10494288,5", "10559092", "10601640",
"10660527", "1071103", "10864110", "10900000", "109153", "1107598204",
"11139934,33", "111645732", "115656", "11702455", "11782593",
"1200061", "1201967", "12115565", "12136102,5", "12515731", "12554644,5",
"12619592", "130290299", "13100400", "1321738", "133029804",
"133937699", "1353062", "13869184", "13957964", "14122864", "14196512",
"14390000", "14474887", "14603307", "146811271", "14803699",
"15000000", "1506314", "15099749", "15127408", "15154485", "1519195",
"1519283", "1523891,2", "153608778", "153702318", "153878654",
"15422351", "154810248", "1553005", "15600000", "15806841", "16043247",
"1613606", "16386883", "16688554", "16933060", "1693550", "172646900",
"172979780", "173803520", "17442963,4", "1750242", "1812580",
"18327716", "1864608", "18664384", "1884269", "19043876", "1913330",
"19208777", "1942999", "19500797", "19634851", "1972054", "1973161,05",
"1999890", "211080741", "21364744", "21478156", "21663285", "21842791",
"2190455", "22297547", "22573195", "227854767", "2278869", "2280790",
"2292085", "233449911", "23557901", "23609845", "2367794", "25966989",
"26225926", "26403842", "26507660", "26582033", "2677014", "2790598",
"28033068", "280678042", "2950462", "2955888125", "2968275",
"29708185", "30259405", "30663885", "311370,5", "3117063", "3123107",
"3136815", "31515034", "3219461", "325000000", "3308400", "3431553",
"34695229,5", "3500011", "35076700", "357250374", "3589825",
"3624829", "3652163", "367262924", "3684877", "37115540", "3739681",
"384833401", "38622368", "386544", "3868422", "4013114", "4094585",
"4165567", "420653055", "4250000", "42630065", "4268944", "4298100",
"43500000", "43625712", "4399059", "4457651", "45118120", "454765583",
"4610465", "46481136", "46521673,5", "467458781", "467833166",
"481844000", "4839160", "488626274", "5000000", "5185252", "5400292,23",
"5410657", "54325792", "54382350", "550434569", "559724655",
"56234467", "563955", "5662911", "5921234", "6120664", "61455087",
"6182013", "6233540", "6250000", "628473", "636467283", "6376123",
"6418934", "6431433", "6556366", "665294893", "6732558", "68472934,5",
"6861041", "6932070", "6959603", "6977580", "69861890", "707902098",
"709341604", "7107432", "75093648", "7511770", "7615691", "7615961",
"787785", "78900000", "7900000", "79310556", "8103006", "8137886",
"8141939,5", "82012623", "82324212", "8429768", "846703", "84731888",
"85542547", "8624021", "8635262", "872772", "9000002", "9132550",
"93241740", "9451311", "963065635", "9660536", "99993083"), class = "factor"),
Total_US_received_from.other = structure(c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("", "0", "100000", "1006501", "1074242,12",
"10743297", "10770310", "1120000", "11699639", "118666908",
"1192627", "11985500", "12107705", "12122971", "12387942",
"12486715", "12557462", "126051735", "126215100", "134355436",
"13666999", "137160", "14202727", "142432873", "14835929",
"150130445", "15392646", "15712570", "16879598", "16927130",
"17050442", "179590", "18794938", "1918087", "192628719",
"2005300", "20359990", "20366277", "208506791", "20955927",
"21057964", "212000", "22502219", "22505978", "23049962",
"2426857", "2540961", "25938524", "2644395", "27128997",
"28974835", "2907023", "295588125", "300000", "32333622",
"3243632", "3291445", "3407393", "365364799", "3662816",
"36641177", "41335774", "423000", "4250463", "42925711",
"436475", "43949647", "442174577", "4436964", "4503686",
"450800882", "455179863", "4600000", "4726938", "478644822",
"48043592", "4859884", "4927344", "50000", "500000", "517156572",
"5231571", "524000000", "5280672", "53344375", "543290",
"5511374", "5550000", "57689913", "579881", "6000000", "606155575",
"613505", "615717", "616075", "6270000", "628088534", "6564966",
"66602217", "6916265", "6948000", "69893696", "700208377",
"7161617", "728849", "7365738", "75291231", "75570838", "766794",
"76704121", "7786228", "7813200", "782593", "7866648", "78878806",
"79893942", "803922830", "8090000", "81536576", "82132042",
"8487981", "8910678", "89609843", "9362395", "936906", "968669",
"9858822", "989033335"), class = "factor"), Total_US_received.Emergency.response.plan_Common.humintarian.fund_country.based. = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("", "0", "10000000", "103252715",
"121000", "13397050", "1400000", "1455021", "1501023", "1622837",
"180167,43", "1822402", "1963871", "250000", "25877890",
"2592709", "2787752", "300000", "3125951", "373134", "3751097",
"37812993", "38249502", "39642858", "43250552", "5000", "500000",
"50215100", "567540", "59264269", "6282314", "6500000", "706400",
"8080810", "82000000", "851664", "9963628"), class = "factor"),
Total_US_received_from.CERF = structure(c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 75L, 2L, 2L, 2L,
2L, 2L, 2L, 16L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 55L, 2L, 2L,
2L, 2L, 230L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 89L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 226L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 90L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
210L, 2L, 2L, 2L, 2L, 106L, 2L, 2L, 2L, 2L, 67L, 2L, 2L,
2L, 105L, 156L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 83L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 213L, 2L, 117L, 2L, 113L,
25L, 2L, 2L, 2L, 2L, 2L, 93L, 2L, 2L, 2L, 181L, 142L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 91L, 222L, 2L, 2L, 2L, 2L,
204L, 2L, 2L, 128L, 2L, 2L, 2L, 11L, 2L, 2L, 2L, 45L, 2L,
2L, 24L), .Label = c("", "0", "1000000", "1025464", "10329268",
"10371212", "10383240", "1040146", "10405408", "106597425",
"1070103", "1093884", "11006301", "11019952", "1107614",
"1110469,5", "11269445", "1142148", "11702455", "11721943",
"11999076", "1200061", "1201967", "12232994", "12365659",
"12455835", "1265586", "12985139,5", "1299825", "13001015",
"13347798", "13794282,2", "1400282", "1450242", "1501344",
"1503314", "1512075", "1516283", "15410406", "1553005", "15645398",
"1585201", "16690193", "16915466", "1693550", "17003929",
"17014450", "1747109", "1751722", "1798653", "18017685",
"1812580", "18512690", "1870201", "1905355", "1940932", "1942999",
"1949761", "1966070", "1967415", "1971425", "1972054", "1978455",
"1990385", "1999890", "1999893", "2000000", "2000095", "2000830",
"20029976", "2006304", "2024378", "2030597", "2032306", "2034923,5",
"2052680", "2069776", "2074049", "20889885", "2164278", "2179607",
"2221613", "2253044", "22683472", "2271877", "22839556",
"2294798", "2354123", "23686839", "24442232,5", "2472028,5",
"2485827", "2496956", "2503311", "2522639", "25230980", "2532163",
"25475033", "2577014", "2579188", "2584669", "25854204",
"25916415,5", "2624107", "2673988", "26746096", "2817063",
"2840964,5", "28437349", "2892761", "2996920", "3000002",
"3000909", "3010825", "3038387", "30853702", "3105354", "311370,5",
"3136815", "3141908", "31528040", "3166825", "3175302", "3245280,5",
"3291599", "3363540", "3376068", "3431553", "3454485", "3498910",
"3500011", "3556532", "3570457", "36564849", "3726365", "3738681",
"3788227", "3795458", "3851878", "3855356", "386544", "3868422",
"3879893", "3897864", "3994126", "3998941", "4005971", "4010361",
"4013114", "4019325", "4094585", "4096923,5", "416325", "4165567",
"4218944", "4250000", "4278128713", "4320080", "4390369",
"4457674", "4529592", "4679803", "4736155", "4782918", "4787929",
"4798922", "4829690", "4834117", "4839160", "4983445", "4989386",
"4997385", "5000000", "5051640", "5054519", "5102132", "5167962",
"5200000", "5352736", "5400292", "5410657", "5522753", "5537051",
"5580667", "563955", "5842338", "5972098", "5986081", "5988888",
"6000000", "60534026", "6141383", "6220011", "6239930", "6248385,5",
"628473", "6300221", "6376123", "6431433", "6450266", "6607366",
"6720259,5", "6932070", "7169618", "7170921", "718751,5",
"7367516", "751772", "7647237", "7716263", "773181", "777854",
"787785", "8015052", "8022382", "8110712", "8137886", "846703",
"8525924", "86172894", "8784528", "8949288,5", "9133227",
"9138176", "914395", "938652", "9568038", "9920625", "9958731",
"9972984"), class = "factor")), row.names = c(NA, 200L), class = "data.frame")
Starting from your data df, we can convert the factor fields into character, then replace the decimal commas with decimal points, and then convert the result to numeric.
library(dplyr)
# Turn every factor column into text, then parse every text column as a
# number after swapping the first comma in each value for a decimal point.
df2 <- mutate_if(
  mutate_if(df, is.factor, as.character),
  is.character,
  function(col) as.numeric(sub(",", ".", col))
)
You might be interested in the totals for each year. Here's a way to total up each column by year:
# Total of every column per year. The pipe after `df2` is required: without
# it the assignment only copies df2 and group_by(year) is evaluated on its
# own, with no data frame to work on.
df_annual_total <- df2 %>%
  group_by(year) %>%
  summarize_all(sum)

R looping with bp function

I have one dataset containing 3 columns: country, year and tdvalue.
I would like to make a loop by country to create a dummy variable (sd) having 1 or 0 if the year is a breakpoint by using R breakpoint function.
But when I run my code, my sd variable always equals 0, even though I know that some years are breakpoints.
Thanks a lot for your help!
library(zoo)
library(sandwich)
library(strucchange)
library(segmented)
library(tree)
# Read the tab-separated input file; the first line holds the column names.
tabo<-read.table("boucle.txt", header=T, sep="\t")
# Intended to return, for the rows selected by `b`, 1 when the year is a
# breakpoint and 0 otherwise.
# NOTE(review): there is no `{` after `function(b)`, so only the statement
# that follows forms the function body and the last `}` below is unmatched.
Fonction.bp<-function(b)
# NOTE(review): the formula refers to tabo$... directly and the subset is
# passed as `tabo =` rather than `data =`, so the `b` subset presumably has
# no effect on the fit -- confirm against the breakpoints() documentation.
bp.inf <- breakpoints(tabo$year ~ tabo$tradevaluein1000usd , tabo = tabo[b,], h = 8)
# Break dates derived from the confidence intervals of the fitted breakpoints
# (presumably one row per breakpoint -- TODO confirm).
t<-breakdates(confint(bp.inf))
for (i in 1:nrow(t)) {
# Compare the selected years with column 1 of row i.
# NOTE(review): column 1 may be the lower confidence bound rather than the
# breakpoint estimate itself -- confirm against the strucchange docs.
res <- ifelse(tabo$year[b] == t[i,1] , 1, 0)
# NOTE(review): return() sits inside the loop, so it exits on the first
# iteration and only the first row of `t` is ever compared.
return(res)
}
}
# Row indices of tabo; not referenced again in the code shown here.
numero<-1:nrow(tabo)
# Calls Fonction.bp once per element of code_o (the country code, not a row
# index); lapply() returns a list, so `sd` is stored as a list column.
tabo$sd<-lapply(tabo$code_o,Fonction.bp)
data sample:
code_o -origin -year -tradevaluein1000usd
ABW Aruba 1988 375.059
ABW Aruba 1989 3458.656
ABW Aruba 1990 2924.484
ABW Aruba 1991 140509.4
etc for several countries
dput(tabo):
structure(list(code_o = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L), .Label = c("ABW", "AFG", "AGO"), class = "factor"),
origin = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Afghanistan", "Angola",
"Aruba"), class = "factor"), year = c(1988L, 1989L, 1990L,
1991L, 1992L, 1993L, 1994L, 1995L, 1996L, 1997L, 1998L, 1999L,
2000L, 2001L, 2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L,
2009L, 2010L, 2011L, 2012L, 1988L, 1989L, 1990L, 1991L, 1992L,
1993L, 1994L, 1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L,
2002L, 2003L, 2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L,
2011L, 2012L, 1988L, 1989L, 1990L, 1991L, 1992L, 1993L, 1994L,
1995L, 1996L, 1997L, 1998L, 1999L, 2000L, 2001L, 2002L, 2003L,
2004L, 2005L, 2006L, 2007L, 2008L, 2009L, 2010L, 2011L, 2012L
), tradevaluein1000usd = c(375.059, 3458.656, 2924.484, 140509.4,
326377, 548739.3, 570287.9, 673563.2, 809647.7, 1021996,
680243.7, 944974.8, 1950097, 1416807, 1055372, 1276015, 2503752,
3908081, 4294362, 4654180, 5523432, 2203173, 272596.5, 4450387,
127760.6, 121861.2, 125059.8, 134163.4, 115283.5, 82499.51,
68673.89, 97143.18, 104883.2, 124654.5, 155892.9, 167802.9,
137721, 153405.3, 99146.39, 103894.9, 190640.9, 209073.9,
264083.6, 254765.3, 408123.6, 507407, 1283451, 609946.1,
486418.4, 67638.02, 1112926, 3120863, 4082248, 3290223, 3796494,
3283747, 3175830, 3614761, 4669298, 4618304, 3501481, 4478671,
7878114, 6290144, 7344164, 8563406, 11900000, 20700000, 30200000,
39500000, 65700000, 38900000, 50600000, 59400000, 8839811
), sd = list(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0)), .Names = c("code_o", "origin", "year",
"tradevaluein1000usd", "sd"), row.names = c(NA, -75L), class = "data.frame")
You've got bad code.
If you expect people's time, you must put more effort in.
Your function doesn't work: tabo['AGO',], tabo['AFG',], and tabo['ABW',] all are empty, as there are no rows with these names. I think you probably want to subset the data, using something like:
tabo[tabo$code_o == 'AGO',]
tabo[tabo$code_o == 'AFG',]
tabo[tabo$code_o == 'ABW',]
bp.inf is the same regardless of whether we include the code tabo = tabo[b,] -- as you're calling the items in the regression from the global environment, and not passing in a data frame (as you're giving tabo rather than data). If this is confusing forget about it ...
The bottom line is that there are a few errors in the line that finds breakpoints. You need to change it to something like bp.inf <- breakpoints(year ~ tradevaluein1000usd, data = tabo[tabo$code_o == 'AGO',], h = 8).
Note also, your function doesn't open with a { so it doesn't work at all.

Resources