dplyr get proportions - r

I have been trying to switch to dplyr and tidyr for my data manipulation (as opposed to data.table and excel). I have a dataframe in long format that looks like this:
TIME GEO geo_num sex_num AGE Value
2014 EU28 1 1 0 13486357
2014 EU28 1 1 5 13683976
2014 EU28 1 1 10 13430899
2014 EU28 1 1 15 13945295
2014 EU28 1 1 20 15417002
2014 EU28 1 1 25 16233349
What I want to obtain is the proportions by sex_num for each age group (AGE):
TIME GEO geo_num sex_num AGE Value percent
2014 EU28 1 1 0 13486357 0.537
2014 EU28 1 1 5 13683976 0.548
2014 EU28 1 1 10 13430899 0.537
2014 EU28 1 1 15 13945295 0.555
2014 EU28 1 1 20 15417002 0.613
2014 EU28 1 1 25 16233349 0.646
This way I would get the totals by sex (my denominator)
mydata %>%
group_by(geo_num,sex_num,TIME) %>%
summarize(total_sex=sum(Value))
But how to use it to get the percent is not totally clear
mydata %>%
group_by(sex_num, TIME, geo_num, AGE) %>%
mutate(freq = Value / total_sex)
Any ideas?
Here's a subset of the data
structure(list(X = 1:40, TIME = c(2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L
), GEO = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "EU28", class = "factor"),
geo_num = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), GEO.1 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "European Union (28 countries)", class = "factor"),
SEX = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("Females", "Males"), class = "factor"), sex_num = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), AGE = c(0, 5, 10, 15,
20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 85.99,
90.99, 0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60,
65, 70, 75, 80, 85, 85.99, 90.99), Value = c(13486357L, 13683976L,
13430899L, 13945295L, 15417002L, 16233349L, 17074499L, 17647415L,
18386977L, 18914596L, 17914397L, 16416147L, 14854062L, 12613840L,
10126857L, 8102599L, 5377238L, 2718258L, 3756915L, 1038657L,
12805779L, 12992860L, 12754636L, 13227105L, 14824565L, 15915997L,
16894408L, 17437631L, 18269939L, 18931544L, 18244203L, 17188595L,
16064384L, 14111303L, 12145307L, 10862721L, 8471793L, 5480758L,
8448678L, 2967920L)), .Names = c("X", "TIME", "GEO", "geo_num",
"GEO.1", "SEX", "sex_num", "AGE", "Value"), class = "data.frame", row.names = c(NA, -40L))

Something like this might get you what you're looking for
mydata <- mydata %>%
group_by(TIME, GEO, geo_num, GEO.1, SEX, sex_num) %>%
mutate(total_sex = sum(Value),
percent = Value / total_sex * 100)
> head(mydata)
# A tibble: 6 x 11
X TIME GEO SEX AGE Value total_sex percent
1 2014 EU28 Males 0 13486357 251139335 5.370070
2 2014 EU28 Males 5 13683976 251139335 5.448759
3 2014 EU28 Males 10 13430899 251139335 5.347987
4 2014 EU28 Males 15 13945295 251139335 5.552812
5 2014 EU28 Males 20 15417002 251139335 6.138824
6 2014 EU28 Males 25 16233349 251139335 6.463881
# ... with 3 more variables

We can use data.table
library(data.table)
setDT(mydata)[, percent := 10*Value/sum(Value) , c(names(mydata)[2:7])]
head(mydata)
# X TIME GEO geo_num GEO.1 SEX sex_num AGE Value percent
#1: 1 2014 EU28 1 European Union (28 countries) Males 1 0 13486357 0.5370070
#2: 2 2014 EU28 1 European Union (28 countries) Males 1 5 13683976 0.5448759
#3: 3 2014 EU28 1 European Union (28 countries) Males 1 10 13430899 0.5347987
#4: 4 2014 EU28 1 European Union (28 countries) Males 1 15 13945295 0.5552812
#5: 5 2014 EU28 1 European Union (28 countries) Males 1 20 15417002 0.6138824
#6: 6 2014 EU28 1 European Union (28 countries) Males 1 25 16233349 0.6463881

Related

Error in R t_test , not enough "x" observations

I am trying to conduct group-wise t-test , but the code i am using returnign an error. It has worked alright for me previously and on other data frame but for this data frame its giving this error
Error in t.test.default(x = 0.0268, y = 0.0223, paired = FALSE,
var.equal = FALSE, : not enough 'x' observations
My Code is
stat.test.BACI5 <- Flaov %>%
group_by(`Treatment`) %>%
t_test(`Observed` ~ Control, detailed = TRUE) %>%
adjust_pvalue(method = "bonferroni") %>%
add_significance()
Here is the data structure
structure(list(Treatment = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Phase1", "Phase2"), class = "factor"), Group = structure(c(3L,
4L, 2L, 3L, 2L, 4L, 1L, 2L, 4L, 3L, 1L, 2L, 1L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 4L, 2L, 3L, 2L, 4L, 3L, 1L, 2L, 4L, 1L, 3L,
1L, 1L, 1L, 2L, 1L, 3L, 2L, 1L, 2L, 3L, 1L, 1L, 1L, 2L, 2L, 2L,
4L, 2L, 1L, 1L, 1L, 4L, 1L, 3L, 1L, 3L, 4L, 2L, 1L, 1L, 2L, 4L,
2L, 3L, 1L, 1L, 2L), .Label = c("Group A ", "Group B", "Group C ",
"Group D"), class = "factor"), Observed = c(0.1057, 0.151, 0.0576,
0.1267, 0.0941, 0.1554, 0.0247, 0.0832, 0.2807, 0.1137, 0.0325,
0.0777, 0.0362, 0.0637, 0.0303, 0.0223, 0.0932, 0.0363, 0.0641,
0.0453, 0.0359, 0.0334, 0.2006, 0.0538, 0.1114, 0.0661, 0.2452,
0.1043, 0.0489, 0.0663, 0.1967, 0.0321, 0.1042, 0.0268, 0.0313,
0.0255, 0.0787, 0.038, 0.1212, 0.0839, 0.0446, 0.0986, 0.1364,
0.0335, 0.0409, 0.0407, 0.0871, 0.0584, 0.0875, 0.1961, 0.0711,
0.0191, 0.0363, 0.0474, 0.1608, 0.0349, 0.1099, 0.0399, 0.1095,
0.2011, 0.057, 0.0418, 0.0394, 0.054, 0.2033, 0.0631, 0.1089,
0.0441, 0.0261, 0.0686), Control = c(0.1061, 0.154, 0.0585, 0.1289,
0.1076, 0.15856, 0.02997, 0.1022, 0.2849, 0.1193, 0.03292, 0.0888,
0.04628, 0.06454, 0.03341, 0.0239, 0.1013, 0.0364, 0.0883, 0.06363,
0.0566, 0.04036, 0.20641, 0.06206, 0.1158, 0.0687, 0.2457, 0.12643,
0.05126, 0.05705, 0.1987, 0.04719, 0.08199, 0.02312, 0.0317,
0.07045, 0.06395, 0.06043, 0.1251, 0.0912, 0.04575, 0.1018, 0.1379,
0.03834, 0.048, 0.04131, 0.0926, 0.06242, 0.0965, 0.1972, 0.0742,
0.0211, 0.04318, 0.05741, 0.1616, 0.06552, 0.1104, 0.04814, 0.11015,
0.2081, 0.06341, 0.04329, 0.04486, 0.06179, 0.2114, 0.05545,
0.1127, 0.04327, 0.03355, 0.07189), factors = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("Phase1", "Phase2"), class = "factor")), row.names = c(NA,
70L), class = "data.frame")
If you are doing a t test between observed and control in the different treatment groups, the formula is wrong, the left hand side of the formula should be the response variable and right hand side should be grouping variable.
In your case, you need to pivot the data long to get something like this:
library(tidyr)
Flaov[,c("Treatment","Observed","Control")] %>%
pivot_longer(-c(Treatment)) %>% group_by(Treatment)
# A tibble: 140 x 3
# Groups: Treatment [2]
Treatment name value
<fct> <chr> <dbl>
1 Phase1 Observed 0.106
2 Phase1 Control 0.106
3 Phase1 Observed 0.151
4 Phase1 Control 0.154
5 Phase1 Observed 0.0576
6 Phase1 Control 0.0585
7 Phase1 Observed 0.127
8 Phase1 Control 0.129
9 Phase1 Observed 0.0941
10 Phase1 Control 0.108
# … with 130 more rows
Then we further pipe it to test:
Flaov[,c("Treatment","Observed","Control")] %>%
pivot_longer(-c(Treatment)) %>%
group_by(Treatment) %>%
t_test(value ~ name)
# A tibble: 2 x 9
Treatment .y. group1 group2 n1 n2 statistic df p
* <fct> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
1 Phase1 value Control Observed 46 46 0.482 90.0 0.631
2 Phase2 value Control Observed 24 24 0.323 46.0 0.748

speed up modelling of subgroups in large data frame

I need to perform an analysis with glmer on many different subgroups of a large dataset and only extract the estimate and z-value of each model. This works perfectly fine if I only use a small subset of my data (or some dummy data, as attached below), but when I try to include the whole data set, it takes forever. Currently I am using this bit of code:
slope_range <- df %>%
group_by(region, year, species) %>%
summarise(slope = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[2],
p_val = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[6])
As I said, this works fine, but very slow on a large data set. I'm aware that I could also just write multiple loops, but I assume this would take even longer. Does anyone have a better solution of what could be done to make it faster? Thanks!
Dummy data:
> dput(df)
structure(list(region = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("ARG", "CHE"), class = "factor"),
transect = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), presence = c(1L, 1L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L,
1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L,
1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L), year = c(2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L), species = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("a", "b"), class = "factor"),
road = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("FG", "MK", "PL", "XY"), class = "factor")), class = "data.frame", row.names = c(NA,
-160L))
You are calling coef(summary(glmer(...))) twice for each group, so you can cut the execution time roughly in half by fitting the model and extracting the coefficients once for each group. The following code will extract all the coefficients and their Z and p-values, not just the two values you specified, which I think is preferable if you might end up needing them later. Of course it can be easily modified to discard the other coefficients and keep only the two you specified.
code
library(tidyverse)
library(lme4)
df %>%
group_by(region, year, species) %>%
group_modify(~ data.frame(variable = c('Intercept', 'transect'),
coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial", data = .)))))
output
# A tibble: 16 x 8
# Groups: region, year, species [8]
region year species variable Estimate Std..Error z.value Pr...z..
<fct> <int> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
1 ARG 2007 a Intercept 6.11 2.81 2.17 0.0300
2 ARG 2007 a transect -0.743 0.361 -2.06 0.0398
3 ARG 2007 b Intercept 1.91 1.22 1.57 0.116
4 ARG 2007 b transect -0.396 0.208 -1.90 0.0570
5 ARG 2017 a Intercept 3.95 1.73 2.28 0.0223
6 ARG 2017 a transect -0.654 0.275 -2.38 0.0174
7 ARG 2017 b Intercept 2.44 1.33 1.83 0.0668
8 ARG 2017 b transect -0.396 0.208 -1.90 0.0570
9 CHE 2007 a Intercept 3.95 1.73 2.28 0.0223
10 CHE 2007 a transect -0.654 0.275 -2.38 0.0174
11 CHE 2007 b Intercept 2.44 1.33 1.83 0.0668
12 CHE 2007 b transect -0.396 0.208 -1.90 0.0570
13 CHE 2017 a Intercept 6.11 2.81 2.17 0.0300
14 CHE 2017 a transect -0.743 0.361 -2.06 0.0398
15 CHE 2017 b Intercept 1.91 1.22 1.57 0.116
16 CHE 2017 b transect -0.396 0.208 -1.90 0.0570
You could use a parallel approach as suggested earlier, e.g. with parallel::mclapply (on my 6-core machine using more than 4 cores gave only marginal improvements, though).
You could speed up glmer using nAGQ=0, at the cost of precision (see https://stats.stackexchange.com/questions/132841/default-lme4-optimizer-requires-lots-of-iterations-for-high-dimensional-data).
Example code with benchmarks:
invisible(lapply(c("lme4", "data.table", "tidyverse", "parallel", "microbenchmark"),
require, character.only = TRUE))
#> Loading required package: lme4
#> Loading required package: Matrix
#> Loading required package: data.table
#> Loading required package: tidyverse
#> Loading required package: parallel
#> Loading required package: microbenchmark
df <- structure(list(region = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("ARG", "CHE"), class = "factor"),
transect = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), presence = c(1L, 1L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L,
1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L,
1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L,
0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L), year = c(2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L, 2007L,
2007L, 2007L, 2007L, 2007L, 2007L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L), species = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("a", "b"), class = "factor"),
road = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("FG", "MK", "PL", "XY"), class = "factor")), class = "data.frame", row.names = c(NA,
-160L))
## Your function for comparison
tidy_fun <- function(){
df %>%
group_by(region, year, species) %>%
summarise(slope = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[2],
p_val = coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial")))[6])
}
gf2 <- function(presence, transect, road, nAGQ = 1L) {
res <- coef(summary(glmer(presence ~ transect + (1 | road), family = "binomial", nAGQ=nAGQ)))
return(data.table(slope=res[2], p_val=res[6]))
}
parLM <- function(mc.cores=4L, nAGQ=1L){
DT <- data.table(df, key = c("region","year","species"))
iDT <- DT[,by=.(region, year, species),.(irange=.(range(.I)))]
result <- mclapply(seq(nrow(iDT)),
function(x) DT[do.call(seq, as.list(iDT[x, irange][[1]])),
.(gf2(presence, transect, road, nAGQ=nAGQ))], mc.cores=mc.cores)
return(cbind(iDT, rbindlist(result))[,-4])
}
microbenchmark(
original = suppressMessages(tidy_fun()),
multicore = parLM(mc.cores = 4L, nAGQ = 1L),
singlecore.nAGQ0 = parLM(mc.cores = 1L, nAGQ = 0L),
multicore.nAGQ0 = parLM(mc.cores = 4L, nAGQ = 0L),
times=10L)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> original 898.2732 925.0621 963.7452 940.9577 973.0648 1157.0030 10
#> multicore 319.1234 334.4151 347.8024 344.1370 362.6539 373.8189 10
#> singlecore.nAGQ0 237.4782 245.4084 262.6290 268.1308 274.8516 280.7944 10
#> multicore.nAGQ0 132.3356 132.9963 137.2777 135.8659 141.5145 144.2564 10
#> cld
#> d
#> c
#> b
#> a

Add column and row total in ftable

I use the ftable to make a table like this:
HPV-16 negative positive
Sex HPV-55
female negative 2341 4
positive 11 0
male negative 2140 23
positive 25 2
Here is the dput code.
structure(c(2341L, 11L, 2140L, 25L, 4L, 0L, 23L, 2L), .Dim = c(4L,
2L), class = "ftable", row.vars = list(Sex = c("female", "male"
), `HPV-55` = c("negative", "positive")), col.vars = list(`HPV-16` = c("negative",
"positive")))
And a sample data of the original data:
structure(list(sex = structure(c(2L, 2L, 1L, 1L, 2L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("female",
"male"), class = c("labelled", "factor"), label = "sex"), orxh16 = structure(c(1L,
1L, 1L, NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("negative", "positive"), class = c("labelled",
"factor"), label = "hpv16"), orxh55 = structure(c(1L, 1L, 1L,
NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("negative", "positive"), class = c("labelled",
"factor"), label = "hpv55")), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
I tried addmargins: addmargins(tab1, FUN = list(Total=sum), quiet = T), but the detailed information e.g., row names and the layout will be lost.
Total
2341 4 2345
11 0 11
2140 23 2163
25 2 27
Total 4517 29 4546
I'm wondering if there is a way to add the column and row total and meanwhile let the layout of the table looks like before (as below)? Thank you!
HPV-16 negative positive Total
Sex HPV-55
female negative 2341 4 2345
positive 11 0 11
male negative 2140 23 2163
positive 25 2 27
Total 4517 29 4546
addmargins should be used before ftable.
xtab1 <- xtabs(~ sex + orxh55 + orxh16, df)
ftable(addmargins(xtab1, margin = 2:3, list(Total = sum)))
# Margins computed over dimensions
# in the following order:
# 1: orxh55
# 2: orxh16
# orxh16 negative positive Total
# sex orxh55
# female negative 10 0 10
# positive 0 0 0
# Total 10 0 10
# male negative 9 0 9
# positive 0 0 0
# Total 9 0 9
Sample Data
df <- structure(list(sex = structure(c(2L, 2L, 1L, 1L, 2L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("female",
"male"), class = c("labelled", "factor"), label = "sex"), orxh16 = structure(c(1L,
1L, 1L, NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("negative", "positive"), class = c("labelled",
"factor"), label = "hpv16"), orxh55 = structure(c(1L, 1L, 1L,
NA, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("negative", "positive"), class = c("labelled",
"factor"), label = "hpv55")), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
Just found a way to do this, using summarytool::ctable. The tabulation is separated by the grouping variable though, it can provide similar tables as in the ftable.
tab2 <- with(hpv2, stby(list(x=orxh55, y=orxh16),
sex, ctable, prop="n", useNA="no", dnn = c("HPV_55", "HPV-16")))
Cross-Tabulation
HPV_55 * HPV-16
Data Frame: hpv2
Group: sex = female
---------- -------- ---------- ---------- -------
HPV-16 negative positive Total
HPV_55
negative 2341 4 2345
positive 11 0 11
Total 2352 4 2356
---------- -------- ---------- ---------- -------
Group: sex = male
---------- -------- ---------- ---------- -------
HPV-16 negative positive Total
HPV_55
negative 2140 23 2163
positive 25 2 27
Total 2165 25 2190

Converting from long to wide and creating multiple new column

I am trying to convert my data frame from a long to a wide format. Currently there is an InputCode column, which has Input A and B, and these need to be their own columns using values from 'DataValue'. Ive been trying spread and dcast,
data_wide <- spread(oldData_long, InputCode, DataValue)
or
data_wide2 <- dcast(oldData_long, Indicator + IndicatorID + InputName DataYear + Country + Division + InputUnit ~ InputCode, value.var="DataValue")
but the number of rows in my dataframe remains the same (84) instead of becoming 42, despite the creation of Input A and Input B columns. Whenever there is a value for Input A theres NA in the column for input B and vice versa.
Furthermore, ideally there would be an InputUnit column for each input Code, e.g 'InputAUnit', as this value will also be unique when trying to spread the data and might be causing my above problem. The same for InputName, but I have no idea how to also pull that information across neatly.
Any help would be greatly appreciated!!
dput:
structure(list(ID = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Indicator = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "Waste Generated", class = "factor"), IndicatorID = c(11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L), InputCode = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("InputA", "InputB"), class = "factor"), InputName = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("Waste Generated - Waste incinerated",
"Waste Generated - Waste sent to landfill"), class = "factor"),
DataValue = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 5L, 1L, 7L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
4L, 6L, 8L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 9L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 10L,
3L), .Label = c("0", "155", "19", "2,898.00", "20,462.34",
"22.317", "4.368", "40", "6,695.65", "8.998"), class = "factor"),
UnitCode = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = "t", class = "factor"), DataYear = c(2009L, 2009L,
2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L,
2009L, 2009L, 2009L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2011L,
2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L,
2011L, 2011L, 2011L, 2011L, 2009L, 2009L, 2009L, 2009L, 2009L,
2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L,
2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L,
2010L, 2010L, 2010L, 2010L, 2010L, 2011L, 2011L, 2011L, 2011L,
2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L,
2011L), Country = structure(c(4L, 1L, 2L, 3L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 13L, 14L, 4L, 1L, 2L, 3L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 4L, 1L, 2L, 3L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 4L, 1L, 2L, 3L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 4L, 1L, 2L,
3L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 4L, 1L,
2L, 3L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L), .Label = c("Afghanistan",
"Albania", "Algeria", "All", "American Samoa", "Andorra",
"Angola", "Antigua and Barbuda", "Argentina", "Armenia",
"Aruba", "Australia", "Austria", "Azerbaijan"), class = "factor"),
ISO = structure(c(5L, 2L, 4L, 14L, 9L, 6L, 3L, 10L, 7L, 8L,
1L, 11L, 12L, 13L, 5L, 2L, 4L, 14L, 9L, 6L, 3L, 10L, 7L,
8L, 1L, 11L, 12L, 13L, 5L, 2L, 4L, 14L, 9L, 6L, 3L, 10L,
7L, 8L, 1L, 11L, 12L, 13L, 5L, 2L, 4L, 14L, 9L, 6L, 3L, 10L,
7L, 8L, 1L, 11L, 12L, 13L, 5L, 2L, 4L, 14L, 9L, 6L, 3L, 10L,
7L, 8L, 1L, 11L, 12L, 13L, 5L, 2L, 4L, 14L, 9L, 6L, 3L, 10L,
7L, 8L, 1L, 11L, 12L, 13L), .Label = c("ABW", "AFG", "AGO",
"ALB", "ALL", "AND", "ARG", "ARM", "ASM", "ATG", "AUS", "AUT",
"AZE", "DZA"), class = "factor"), Division = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Test", class = "factor"),
FurtherDetails1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "Test1", class = "factor"), FurtherDetails2 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Test2", class = "factor")), class = "data.frame", row.names = c(NA,
-84L))
This would be the ideal output :
structure(list(ID = c(NA, NA, NA, NA, NA, NA), Indicator = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "Waste Generated", class = "factor"),
IndicatorID = c(11L, 11L, 11L, 11L, 11L, 11L), DataYear = c(2009L,
2009L, 2009L, 2009L, 2009L, 2009L), Country = structure(c(4L,
1L, 2L, 3L, 5L, 6L), .Label = c("Afghanistan", "Albania",
"Algeria", "All", "American Samoa", "Andorra", "Angola",
"Antigua and Barbuda", "Argentina", "Armenia", "Aruba", "Australia",
"Austria", "Azerbaijan"), class = "factor"), ISO = structure(c(5L,
2L, 4L, 14L, 9L, 6L), .Label = c("ABW", "AFG", "AGO", "ALB",
"ALL", "AND", "ARG", "ARM", "ASM", "ATG", "AUS", "AUT", "AZE",
"DZA"), class = "factor"), Division = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = "Test", class = "factor"), FurtherDetails1 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "Test1", class = "factor"),
FurtherDetails2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "Test2", class = "factor"),
InputA = c(0L, 0L, 0L, 0L, 0L, 0L), InputAUnit = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("", "t"), class = "factor"),
InputAName = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("",
"Waste Generated - Waste sent to landfill"), class = "factor"),
InputB = c(0L, 0L, 0L, 0L, 0L, 0L), InputBUnit = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("", "t"), class = "factor"),
InputBName = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("",
"Waste Generated - Waste incinerated"), class = "factor")), row.names = c(NA,
6L), class = "data.frame")
Thanks!!
A possible tidyr solution.
library(tidyr)
out <- pivot_wider(oldData_long, names_from = InputCode, values_from = c(DataValue, UnitCode, InputName))
out
# A tibble: 42 x 15
ID Indicator IndicatorID DataYear Country ISO Division FurtherDetails1 FurtherDetails2 DataValue_InputA DataValue_InputB
<lgl> <fct> <int> <int> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
1 NA Waste Ge… 11 2009 All ALL Test Test1 Test2 0 0
2 NA Waste Ge… 11 2009 Afghan… AFG Test Test1 Test2 0 0
3 NA Waste Ge… 11 2009 Albania ALB Test Test1 Test2 0 0
4 NA Waste Ge… 11 2009 Algeria DZA Test Test1 Test2 0 0
5 NA Waste Ge… 11 2009 Americ… ASM Test Test1 Test2 0 0
6 NA Waste Ge… 11 2009 Andorra AND Test Test1 Test2 0 0
7 NA Waste Ge… 11 2009 Angola AGO Test Test1 Test2 0 0
8 NA Waste Ge… 11 2009 Antigu… ATG Test Test1 Test2 0 0
9 NA Waste Ge… 11 2009 Argent… ARG Test Test1 Test2 0 0
10 NA Waste Ge… 11 2009 Armenia ARM Test Test1 Test2 0 0
# … with 32 more rows, and 4 more variables: UnitCode_InputA <fct>, UnitCode_InputB <fct>, InputName_InputA <fct>, InputName_InputB <fct>
str(out)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 42 obs. of 15 variables:
$ ID : logi NA NA NA NA NA NA ...
$ Indicator : Factor w/ 1 level "Waste Generated": 1 1 1 1 1 1 1 1 1 1 ...
$ IndicatorID : int 11 11 11 11 11 11 11 11 11 11 ...
$ DataYear : int 2009 2009 2009 2009 2009 2009 2009 2009 2009 2009 ...
$ Country : Factor w/ 14 levels "Afghanistan",..: 4 1 2 3 5 6 7 8 9 10 ...
$ ISO : Factor w/ 14 levels "ABW","AFG","AGO",..: 5 2 4 14 9 6 3 10 7 8 ...
$ Division : Factor w/ 1 level "Test": 1 1 1 1 1 1 1 1 1 1 ...
$ FurtherDetails1 : Factor w/ 1 level "Test1": 1 1 1 1 1 1 1 1 1 1 ...
$ FurtherDetails2 : Factor w/ 1 level "Test2": 1 1 1 1 1 1 1 1 1 1 ...
$ DataValue_InputA: Factor w/ 10 levels "0","155","19",..: 1 1 1 1 1 1 1 1 1 1 ...
$ DataValue_InputB: Factor w/ 10 levels "0","155","19",..: 1 1 1 1 1 1 1 1 1 1 ...
$ UnitCode_InputA : Factor w/ 1 level "t": 1 1 1 1 1 1 1 1 1 1 ...
$ UnitCode_InputB : Factor w/ 1 level "t": 1 1 1 1 1 1 1 1 1 1 ...
$ InputName_InputA: Factor w/ 2 levels "Waste Generated - Waste incinerated",..: 2 2 2 2 2 2 2 2 2 2 ...
$ InputName_InputB: Factor w/ 2 levels "Waste Generated - Waste incinerated",..: 1 1 1 1 1 1 1 1 1 1 ...

sum and conditionally count based on a second column

I have gotten frustrated trying to solve this seemingly simple problem. I have a dataset (df) like this:
structure(list(Year = c(2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L), Unknown = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), Temp = c(21L, 21L, 21L, 23L, 23L, 21L, 21L, 22L, 21L, 23L,
23L, 22L, 21L, 21L, 22L, 22L, 21L, 21L, 23L, 23L), Obs = structure(c(1L,
1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L), .Label = c("mdk", "sde"), class = "factor"), State = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "ma", class = "factor"), Zone = c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), Segment = c(8L, 7L, 4L, 17L, 18L, 7L, 2L, 12L, 1L, 17L,
18L, 12L, 9L, 7L, 13L, 11L, 8L, 9L, 17L, 18L), Subseg = c(1L,
3L, 3L, 2L, 2L, 2L, 4L, 0L, 10L, 4L, 2L, 0L, 1L, 1L, 3L, 1L,
2L, 2L, 1L, 1L), Wdir = structure(c(2L, 2L, 1L, 3L, 3L, 2L, 2L,
1L, 2L, 3L, 3L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L), .Label = c("na",
"ne", "nw"), class = "factor"), Wvel = structure(c(1L, 1L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L), .Label = c("5", "na"), class = "factor"), Clouds = structure(c(1L,
1L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 1L,
1L, 3L, 3L), .Label = c("1", "4", "na"), class = "factor"), Temp.1 = structure(c(1L,
1L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 1L,
1L, 3L, 3L), .Label = c("20", "25", "na"), class = "factor"),
Species = structure(c(7L, 21L, 1L, 21L, 16L, 4L, 16L, 6L,
1L, 17L, 5L, 7L, 5L, 1L, 1L, 6L, 7L, 7L, 24L, 5L), .Label = c("ABDU",
"ABDU", "ABDU", "ABDU", "ABDU", "CAGO", "CAGO", "CAGO", "CAGO",
"CAGO", "GOLD", "GOLD", "GOLD", "GOLD", "GOLD", "MERG", "MERG",
"MERG", "MERG", "MERG", "SCOT", "SCOT", "SCOT", "SCOT",
"SCOT", "SCOT", "SCOT"), class = "factor"), Count = c(5L,
1L, 150L, 3L, 20L, 8L, 5L, 10L, 5L, 1L, 20L, 10L, 2L, 2L,
80L, 40L, 1L, 1000L, 2L, 20L)), .Names = c("Year", "Unknown",
"Temp", "Obs", "State", "Zone", "Segment", "Subseg", "Wdir",
"Wvel", "Clouds", "Temp.1", "Species", "Count"), row.names = c(666L,
614L, 2060L, 1738L, 1459L, 536L, 197L, 2467L, 98L, 1794L, 1449L,
2464L, 696L, 483L, 2644L, 2350L, 686L, 844L, 2989L, 2934L), class = "data.frame")
With a header that looks like this:
Year Unknown Temp Obs State Zone Segment Subseg Wdir Wvel
666 2015 1 21 mdk ma 2 8 1 ne 5
614 2015 1 21 mdk ma 2 7 3 ne 5
2060 2015 1 21 sde ma 2 4 3 na na
1738 2015 1 23 mdk ma 2 17 2 nw 5
1459 2015 1 23 mdk ma 2 18 2 nw 5
536 2015 1 21 mdk ma 2 7 2 ne 5
Clouds Temp.1 Species Count
666 1 20 CAGO 5
614 1 20 SCOT 1
2060 na na ABDU 150
1738 1 20 SCOT 3
1459 1 20 MERG 20
536 1 20 ABDU 8
Among other things within dplyr, I want to get a sum of each species as a new column, when I am grouping by segment. This is the final code I have tried with many variations.
df_group = df %>%
group_by(Segment) %>%
summarise(temp = round(mean(Temp)),
WDir = round(mean(Wdir)),
ABDU = sum(which(Species=="ABDU"),Count),
CAGO = sum(which(Species=="CAGO"),Count),
GOLD = sum(which(Species=="GOLD"),Count),
MERG = sum(which(Species=="MERG"),Count),
SCOT = sum(which(Species=="SCOT"),Count))
And this is what I get (to show correct format):
Segment temp WDir ABDU CAGO GOLD MERG SCOT
1 1 21 2 6 5 5 5 5
2 2 21 2 5 5 5 6 5
3 4 21 1 151 150 150 150 150
4 7 21 2 16 11 11 11 12
5 8 21 2 6 9 6 6 6
6 9 21 2 1003 1004 1002 1002 1002
The format and general idea are what I want, but the numbers are not adding up the way I want them to. I'm sure it is simple but need some help! Thanks.
The problem is that which returns a vector of the positions, but you're not using those to subset. So the sum you are getting is of the positions which are true in addition to the count variable. e.g.
x <- c("a", "b", "b")
count <- c(10, 11, 12)
sum(which(c("a", "b", "b") == "b"), count)
# 38 because it is 2 + 3 + 10 + 11 + 12
I believe what you want is (or at least one way of writing it):
sum(ifelse(x == "b", count, 0))
# 23 because it is equal to 0 + 11 + 12
Translating into dplyr syntax, your example could look like this:
df_group = df %>%
group_by(Segment) %>%
summarise(temp = round(mean(Temp)),
WDir = round(mean(Wdir)),
ABDU = sum(ifelse(Species=="ABDU", Count, 0L)),
CAGO = sum(ifelse(Species=="CAGO", Count, 0L)),
GOLD = sum(ifelse(Species=="GOLD", Count, 0L)),
MERG = sum(ifelse(Species=="MERG", Count, 0L)),
SCOT = sum(ifelse(Species=="SCOT", Count, 0L)))
Another approach, in case you don't want to type out the sum for all your species:
library(reshape2)
library(dplyr)
# I had a problem with duplicate factor levels from your dput,
# so I re-factored species
df$Species = as.factor(as.character(df$Species))
species.counts = select(df, Segment, Species, Count) %>%
dcast(formula = Segment ~ Species, value.var = "Count", fun.aggregate = sum)
> head(species.counts)
Segment ABDU CAGO MERG SCOT
1 1 5 0 0 0
2 2 0 0 5 0
3 4 150 0 0 0
4 7 10 0 0 1
5 8 0 6 0 0
6 9 2 1000 0 0
df %>% group_by(Segment) %>%
summarise(temp = round(mean(Temp))) %>%
left_join(species.counts)
Source: local data frame [11 x 6]
Segment temp ABDU CAGO MERG SCOT
1 1 21 5 0 0 0
2 2 21 0 0 5 0
3 4 21 150 0 0 0
4 7 21 10 0 0 1
5 8 21 0 6 0 0
6 9 21 2 1000 0 0
I also couldn't do the wind direction average, because your dput data only has that as a factor with the directions, not like the head() you showed, but the technique generalizes.

Resources