mean for subsets using survey - r

I have two variables "c" and "q" in a data.frame. "c" is a number between zero and one (a level of poverty) and "q" indicates whether the household (or subject) is poor (1) or not poor (0).
How can I calculate the mean of "c" for only the poor households (q = 1)?
What I need
Important detail: I have a database for a country and I want this result by region.
I am using the svyby like this:
svyby( ~q , ~region , design = base2015_pos , na.rm=TRUE, svytotal)
That way R gives me the number of poor households by region, which I don't need now. I need the mean of a subset (see image above) by region.
structure(list(domicilio = c(11000015001, 11000015003, 11000015004), agua = c(0, 0, 6), ind_agua = c(0, 0, 1), esgoto = c(1, 1, 6), ind_cond_sanitaria = c(1, 1, 1), lixo = c(0, 0, 0), ind_lixo = c(0, 0, 0), luz = c(0, 0, 0), ind_iluminacao = c(0, 0, 0), ativos = c(0, 0, 0), ind_ativos = c(0, 0, 0), emprego = c(0, 0, 0), ind_emprego = c(0, 0, 0), renda = c(0, 0, 0), ind_renda = c(0, 0, 0), casa = c(1, 1, 0), ind_riqueza = c(1, 1, 0), anos = c(0, 0, 0), ind_estudo = c(0, 0, 0), ler = c(0, 0, 0), ind_alfabetizado = c(0, 0, 0), peso = c(270, 270, 270), sexo = c(0, 1, 1), uf = c("11", "11", "11"), v4609 = c("001772940", "001772940", "001772940"), v4617 = c(110001, 110001, 110001), v4618 = c(1, 1, 1), pre_wgt = c(200, 200, 200), one = c(1L,
1L, 1L), region = c("1", "1", "1"), c = c(0.2, 0.2, 0.2), q = c(0, 0, 0)), .Names = c("domicilio", "agua", "ind_agua", "esgoto", "ind_cond_sanitaria", "lixo", "ind_lixo", "luz", "ind_iluminacao","ativos", "ind_ativos", "emprego", "ind_emprego", "renda", "ind_renda", "casa", "ind_riqueza", "anos", "ind_estudo", "ler", "ind_alfabetizado","peso", "sexo", "uf", "v4609", "v4617", "v4618", "pre_wgt", "one", "region", "c", "q"), row.names = c(NA, 3L), class = "data.frame")

# complex sample survey design
library(survey)
# your data.frame
x <- structure(list(domicilio = c(11000015001, 11000015003, 11000015004), agua = c(0, 0, 6), ind_agua = c(0, 0, 1), esgoto = c(1, 1, 6), ind_cond_sanitaria = c(1, 1, 1), lixo = c(0, 0, 0), ind_lixo = c(0, 0, 0), luz = c(0, 0, 0), ind_iluminacao = c(0, 0, 0), ativos = c(0, 0, 0), ind_ativos = c(0, 0, 0), emprego = c(0, 0, 0), ind_emprego = c(0, 0, 0), renda = c(0, 0, 0), ind_renda = c(0, 0, 0), casa = c(1, 1, 0), ind_riqueza = c(1, 1, 0), anos = c(0, 0, 0), ind_estudo = c(0, 0, 0), ler = c(0, 0, 0), ind_alfabetizado = c(0, 0, 0), peso = c(270, 270, 270), sexo = c(0, 1, 1), uf = c("11", "11", "11"), v4609 = c("001772940", "001772940", "001772940"), v4617 = c(110001, 110001, 110001), v4618 = c(1, 1, 1), pre_wgt = c(200, 200, 200), one = c(1L,
1L, 1L), region = c("1", "1", "1"), c = c(0.2, 0.2, 0.2), q = c(0, 0, 0)), .Names = c("domicilio", "agua", "ind_agua", "esgoto", "ind_cond_sanitaria", "lixo", "ind_lixo", "luz", "ind_iluminacao","ativos", "ind_ativos", "emprego", "ind_emprego", "renda", "ind_renda", "casa", "ind_riqueza", "anos", "ind_estudo", "ler", "ind_alfabetizado","peso", "sexo", "uf", "v4609", "v4617", "v4618", "pre_wgt", "one", "region", "c", "q"), row.names = c(NA, 3L), class = "data.frame")
# Placeholder design: ids = ~1 with pre_wgt as the weights. This is not the
# correct svydesign() statement for real data; use the one documented for
# your own data set.
y <- svydesign(ids = ~1, weights = ~pre_wgt, data = x)
# Restrict the design object to the poor households (q == 1).
z <- subset(y, q == 1)
# Mean of c within each region, computed on the subsetted design.
svyby(formula = ~c, by = ~region, design = z, FUN = svymean)

# Mean of c for each distinct value of q (one output row per q group).
# Note: this is a plain data.frame computation; no survey weights are applied.
aggregate(df$c, by=list(df$q), FUN=mean)

Here's another possibility. To illustrate, create a dataset per your parameters:
# Reproducible toy data: c is drawn uniformly from [0, 1], q is a 0/1 draw.
set.seed(787)
dat.a <- runif(10, min = 0, max = 1)
dat.b <- rbinom(10, size = 1, prob = 0.5)
# cbind() puts both vectors into one numeric matrix whose columns are already
# named c and q, so no separate renaming step is needed.
dat.1 <- data.frame(cbind(c = dat.a, q = dat.b))
dat.1
c q
1 0.35326234 1
2 0.45277055 0
3 0.29505270 0
4 0.78723105 1
5 0.95915348 1
6 0.17505284 0
7 0.79693672 0
8 0.01648420 1
9 0.02706417 0
10 0.93996311 1
Now subset by extracting all rows that match q=1 and compute mean for column c in resulting output:
# Keep the rows whose q equals 1, then average c over those rows.
dat.1.subset <- dat.1[dat.1[["q"]] == 1, ]
mean(dat.1.subset[["c"]])
[1] 0.6112188

Related

Sorting barplot based on multi-categories in r

I am trying to get a bar plot for sentiment scores corrected as per the following order and put into two separate colors:
(NEGATIVE) anger, disgust, fear, sadness, negative --- (POSITIVE) anticipation, joy, surprise, trust, positive.
Below is the code which only gives a decreasing plot.
barplot(sort(colSums(s), decreasing = TRUE),
las = 2,
col = rainbow(2),
ylab = 'Count',
main = 'User Synergies')
> dput(head(s))
structure(list(anger = c(1, 0, 0, 0, 0, 0), anticipation = c(0,
0, 5, 0, 0, 0), disgust = c(0, 0, 0, 0, 0, 0), fear = c(1, 0,
2, 1, 0, 0), joy = c(1, 0, 1, 0, 0, 0), sadness = c(1, 0, 2,
1, 0, 0), surprise = c(0, 0, 2, 1, 0, 0), trust = c(4, 2, 3,
1, 0, 1), negative = c(2, 0, 3, 2, 1, 1), positive = c(4, 4,
7, 1, 0, 2)), row.names = c(NA, 6L), class = "data.frame")
Another way:
# Sentiment columns grouped by polarity; negatives are plotted first.
negative <- c("anger", "disgust", "fear", "sadness", "negative")
positive <- c("anticipation", "joy", "surprise", "trust", "positive")
# Column totals in the requested order: one red bar per negative term,
# followed by one cyan bar per positive term.
barplot(
  height = colSums(s[, c(negative, positive)]),
  col = rep(c("red", "cyan"), times = c(length(negative), length(positive))),
  las = 2,
  ylim = c(0, 20),
  ylab = "Count",
  main = "User Synergies"
)
The result:
Try this ,
# Sample sentiment counts: one column per sentiment term, six rows.
df <- structure(list(anger = c(1, 0, 0, 0, 0, 0),
                     anticipation = c(0, 0, 5, 0, 0, 0),
                     disgust = c(0, 0, 0, 0, 0, 0),
                     fear = c(1, 0, 2, 1, 0, 0),
                     joy = c(1, 0, 1, 0, 0, 0),
                     sadness = c(1, 0, 2, 1, 0, 0),
                     surprise = c(0, 0, 2, 1, 0, 0),
                     trust = c(4, 2, 3, 1, 0, 1),
                     negative = c(2, 0, 3, 2, 1, 1),
                     positive = c(4, 4, 7, 1, 0, 2)),
                row.names = c(NA, 6L), class = "data.frame")
# "#FF0000" "#00FFFF": pn[1] marks positive terms, pn[2] marks negative terms.
pn <- rainbow(2)
# Column totals, largest first.
s <- sort(colSums(df), decreasing = TRUE)
names(s)
#> [1] "positive" "trust" "negative" "anticipation" "fear"
#> [6] "sadness" "surprise" "joy" "anger" "disgust"
# Choose each bar's colour from its name rather than its position, so the
# colours stay correct if the totals (and therefore the sort order) change.
positive_terms <- c("positive", "trust", "anticipation", "surprise", "joy")
col <- ifelse(names(s) %in% positive_terms, pn[1], pn[2])
barplot(s,
        las = 2,
        col = col,
        ylab = "Count",
        main = "User Synergies")
Created on 2022-05-31 by the reprex package (v2.0.1)
You may try
library(dplyr)
library(reshape2)
# Toy totals: a one-row data frame with one column per sentiment term.
df <- data.frame(
anger = 200,
disgust = 100,
fear = 900,
sadness = 400,
negative = 1500,
anticipation = 2000,
joy = 1200,
surprise = 300,
trust = 2500,
positive = 5000
)
# Two-colour palette. It is indexed below by the colo factor, i.e. by its
# integer codes: level "0" -> "red", level "1" -> "blue".
pall <- c("red", "blue")
# Pipeline: column totals -> long data frame -> ordered factor -> barplot.
colSums(df) %>%
# NOTE(review): melt() presumably yields a data frame with a `value` column
# and the sentiment names as row names; the later steps rely on that.
melt %>%
tibble::rownames_to_column(., "sentiments") %>%
# Fix the factor level order so bars are drawn negative terms first, then positive.
mutate(sentiments = factor(sentiments, levels = c("anger", "disgust", "fear", "sadness", "negative", "anticipation", "joy", "surprise", "trust", "positive"))) %>%
# colo is 0 for the five negative terms and 1 for everything else.
mutate(colo = ifelse(sentiments %in% c("anger", "disgust", "fear", "sadness", "negative"), 0, 1) %>% as.factor) %>%
barplot(data = ., value ~ sentiments, col = pall[.$colo], las = 2, xlab = "")
Another approach :
# Sample sentiment counts: one column per sentiment term, six rows.
df <- structure(list(anger = c(1, 0, 0, 0, 0, 0),
                     anticipation = c(0, 0, 5, 0, 0, 0),
                     disgust = c(0, 0, 0, 0, 0, 0),
                     fear = c(1, 0, 2, 1, 0, 0),
                     joy = c(1, 0, 1, 0, 0, 0),
                     sadness = c(1, 0, 2, 1, 0, 0),
                     surprise = c(0, 0, 2, 1, 0, 0),
                     trust = c(4, 2, 3, 1, 0, 1),
                     negative = c(2, 0, 3, 2, 1, 1),
                     positive = c(4, 4, 7, 1, 0, 2)),
                row.names = c(NA, 6L), class = "data.frame")
# Column totals, largest first.
s <- sort(colSums(df), decreasing = TRUE)
# Terms treated as positive sentiment; every other term is treated as negative.
pos <- c("positive", "trust", "anticipation", "surprise", "joy")
# One colour per bar, chosen by the bar's name: cyan if positive, red otherwise.
col <- ifelse(names(s) %in% pos, "cyan", "red")
barplot(height = s, col = col, las = 2, ylab = "Count", main = "User Synergies")
Created on 2022-05-31 by the reprex package (v2.0.1)

How to work with annual country data in R?

It's my first time using R. I want to create a scatterplot with a line of best fit for a decade of data about all countries. I joined two Excel datasets: one has the number of people jailed for a certain crime by country in a given year (rows: country, columns: year); the other has the average income for a certain population group (rows: country, columns: year).
dataclean=inner_join(EnforcementData, IncomeData, by = "Country")
This gives me a dataset with x, y points where enforcement is the x and income is the y
I want to plot this and find the outliers - so those countries where enforcement is out of step with income. I tried:
ggplot(dataclean, aes(x=EnforcementData, y=IncomeData, group= "Country")) +
geom_line(aes(color = "Country")
Thanks for any suggestions!
EDIT: I think I've improperly merged the datasets somehow, as it returns a matrix. Like this:
dput(head(dataclean))
structure(list(Country = c("Albania", "Algeria", "Angola", "Antigua and Barbuda",
"Argentina", "Armenia"), 2006.x = c(0, 0, 0, 0, 0, 0), 2007.x = c(0,
0, 0, 0, 0, 0), 2008.x = c(0, 0, 0, 0, 3, 0), 2009.x = c(0,
0, 0, 0, 2, 0), 2010.x = c(0, 0, 3, 0, 0, 0), 2011.x = c(0,
0, 0, 0, 4, 0), 2012.x = c(0, 0, 0, 0, 2, 0), 2013.x = c(1,
1, 3, 0, 3, 0), 2014.x = c(0, 0, 0, 0, 1, 0), 2015.x = c(0,
0, 1, 1, 0, 0), 2016.x = c(0, 0, 5, 1, 5, 0), 2017.x = c(0,
0, 3, 0, 0, 0), 2018.x = c(0, 0, 0, 0, 0, 0), 2019.x = c(0,
1, 3, 0, 0, 0), 2020.x = c(0, 1, 0, 0, 0, 0), 2006.y = c(3.273755,
2.9912451, 3.689971, 1.342365, 2.8111637, 3.1407325), 2007.y = c(3.157699,
3.0298389, 3.759603, 1.315153, 2.8102016, 3.2122944), 2008.y = c(3.0636166,
3.0644794, 3.754531, 1.181255, 2.9054865, 3.1780076), 2009.y = c(3.0084051,
3.0477934, 3.874565, 1.144331, 2.9149061, 3.0896677), 2010.y = c(2.9951254,
2.9948973, 3.796005, 1.161454, 2.8314702, 3.1664003), 2011.y = c(3.1528966,
3.0144704, 3.814187, 1.190574, 2.8360401, 3.1267727), 2012.y = c(3.1964009,
2.9731618, 3.73838, 1.201921, 2.913096, 3.0577149), 2013.y = c(3.1683419,
2.943247, 3.779373, 1.209151, 2.9020493, 3.0017037), 2014.y = c(3.0180735,
3.0699088, 3.913854, 1.8298544, 3.0114942, 2.9938708), 2015.y = c(2.9489451,
3.1155215, 3.864924, 1.7799824, 3.0169873, 3.0037498), 2016.y = c(2.8750588,
3.1476701, 3.909438, 1.7761061, 2.7538409, 3.041738), 2017.y = c(2.8906318,
3.0717401, 3.880863, 2.2256225, 2.7280908, 3.0332232), 2018.y = c(2.9485421,
3.12678, 3.609102, 2.1923678, 2.5386973, 2.8175096), 2019.y = c(3.0029988,
3.0910585, 3.524361, 2.1915031, 2.5461976, 2.6481938), 2020.y = c(1.9297139,
3.1117555, 3.3970031, 2.1946293, 2.5862916, 2.438313)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))

Decision tree and error matrix calculations

I've created a decision tree using rpart and the code below:
res.tree <- rpart(myformula, data = credit_train)
my data has been subset into 2 parts. The training part at 70% and a testing part at 30%.
This part works well and my tree is created. Where I'm getting stuck is with the prediction so that I can calculate my confusion matrix and ROC curves.
I'm using this code tree_pred = predict(res.tree, credit_test, type = "class")
but I get this message:
Error in predict.rpart(res.tree, credit_test, type = "class") : Invalid prediction for "rpart" object
In addition:
Warning message:
'newdata' had 271 rows but variables found have 729 rows
I can't figure out whether I'm missing a library or what is causing it not to recognize the type (which so many resources say I need to use), or why I'm getting a mismatch in the rows.
The 'newdata' at 271 rows is what my testing data set has and my training data-set has 729 rows.
Is the decision tree creation causing my problem or could it be the prediction code?
Responding to comments:
I'm using the following libraries:
library(readxl)
library(dplyr)
library(factoextra)
library(corrplot)
library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(pROC)
library(Hmisc)
library(fBasics)
library(rattle)
library(caret)
A sample of my data:
structure(list(CHK_ACCT = c(0, 1, 0, 0), DURATION = c(6, 48,
42, 24), HISTORY = c(4, 2, 2, 3), NEW_CAR = c(0, 0, 0, 1), USED_CAR = c(0,
0, 0, 0), FURNITURE = c(0, 0, 1, 0), `RADIO/TV` = c(1, 1, 0,
0), EDUCATION = c(0, 0, 0, 0), RETRAINING = c(0, 0, 0, 0), AMOUNT = c(1169,
5951, 7882, 4870), SAV_ACCT = c(4, 0, 0, 0), EMPLOYMENT = c(4,
2, 3, 2), INSTALL_RATE = c(4, 2, 2, 3), MALE_DIV = c(0, 0, 0,
0), MALE_SINGLE = c(1, 0, 1, 1), MALE_MAR_or_WID = c(0, 0, 0,
0), `CO-APPLICANT` = c(0, 0, 0, 0), GUARANTOR = c(0, 0, 1, 0),
PRESENT_RESIDENT = c(4, 2, 4, 4), REAL_ESTATE = c(1, 1, 0,
0), PROP_UNKN_NONE = c(0, 0, 0, 1), AGE = c(67, 22, 45, 53
), OTHER_INSTALL = c(0, 0, 0, 0), RENT = c(0, 0, 0, 0), OWN_RES = c(1,
1, 0, 0), NUM_CREDITS = c(2, 1, 1, 2), JOB = c(2, 2, 2, 2
), NUM_DEPENDENTS = c(1, 1, 2, 2), TELEPHONE = c(1, 0, 0,
0), FOREIGN = c(0, 0, 0, 0), DEFAULT = c(0, 1, 0, 1), CHK_ACCT_rec = c(1,
2, 1, 1), SAV_ACCT_rec = c(0, 1, 1, 1)), .Names = c("CHK_ACCT",
"DURATION", "HISTORY", "NEW_CAR", "USED_CAR", "FURNITURE", "RADIO/TV",
"EDUCATION", "RETRAINING", "AMOUNT", "SAV_ACCT", "EMPLOYMENT",
"INSTALL_RATE", "MALE_DIV", "MALE_SINGLE", "MALE_MAR_or_WID",
"CO-APPLICANT", "GUARANTOR", "PRESENT_RESIDENT", "REAL_ESTATE",
"PROP_UNKN_NONE", "AGE", "OTHER_INSTALL", "RENT", "OWN_RES",
"NUM_CREDITS", "JOB", "NUM_DEPENDENTS", "TELEPHONE", "FOREIGN",
"DEFAULT", "CHK_ACCT_rec", "SAV_ACCT_rec"), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
myformula = credit_train$DEFAULT ~ credit_train$CHK_ACCT_rec +
credit_train$DURATION + credit_train$HISTORY + credit_train$NEW_CAR +
credit_train$USED_CAR + credit_train$FURNITURE + credit_train$`RADIO/TV` +
credit_train$EDUCATION + credit_train$RETRAINING + credit_train$AMOUNT +
credit_train$SAV_ACCT_rec + credit_train$EMPLOYMENT +
credit_train$INSTALL_RATE + credit_train$MALE_DIV + credit_train$MALE_SINGLE
+ credit_train$MALE_MAR_or_WID + credit_train$`CO-APPLICANT` +
credit_train$GUARANTOR + credit_train$PRESENT_RESIDENT +
credit_train$REAL_ESTATE + credit_train$PROP_UNKN_NONE + credit_train$AGE +
credit_train$OTHER_INSTALL + credit_train$RENT + credit_train$OWN_RES +
credit_train$NUM_CREDITS + credit_train$JOB + credit_train$NUM_DEPENDENTS +
credit_train$TELEPHONE + credit_train$FOREIGN
#calimo I hope this is what you needed.

How to customize the tooltip of ggplotly?

I have tried to follow different answers here but none worked. I went through the plotly official documentation and came up with following:
Data
Following is a sample of the data set:
> dput(head(df))
structure(list(ID = c(-1, -1, -1, -1, -1, -1), spacing.ft = c(0,
0, 0, 0, 0, 0), gap.s = c(0, 0, 0, 0, 0, 0), frspacing.ft = c(0,
0, 0, 0, 0, 0), TTC = c(0, 0, 0, 0, 0, 0), LV.vel.fps = c(0,
0, 0, 0, 0, 0), x = c(0, 0, 0, 0, 0, 0), y = c(0, 0, 0, 0, 0,
0), z = c(0, 0, 0, 0, 0, 0), frames = 29373:29378, df16 = c(6L,
6L, 6L, 6L, 6L, 6L), ADO.name = structure(c(NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), .Label = c("BlueT5",
"ghtTFrei10", "ilT6Carg", "owT8Yell", "CargoT4", "MoveT12", "RaceT11",
"RedT1", "SemiT3", "StarT7", "WhiteT2", "artTWalm9"), class = "factor"),
speed.fps.ED = c(33.25, 33.4, 33.55, 33.7, 33.84, 33.99),
deltaV.fps = c(33.25, 33.4, 33.55, 33.7, 33.84, 33.99)), .Names = c("ID",
"spacing.ft", "gap.s", "frspacing.ft", "TTC", "LV.vel.fps", "x",
"y", "z", "frames", "df16", "ADO.name", "speed.fps.ED", "deltaV.fps"
), row.names = c(NA, 6L), class = "data.frame")
What I want to do:
I want to customize the tooltip to add speed, speed.fps.ED. I tried following:
library(ggplot2)
library(plotly)
mt.plot <- ggplot() +
geom_point(data = df,
mapping = aes(x = deltaV.fps, y = frspacing.ft, color = ADO.name))
# Build the ggplot:
p <- plotly_build(mt.plot)
# Change the tooltip:
p$data[[1]]$text <- paste("ED.speed = ", df$speed.fps.ED)
p$filename <- 'test'
r <- plotly_POST(p)
knit_print.plotly(r, options=list())
You can see the resulting plot here: Plot.
Problem
The problem is that the third element in the tooltip is displayed only for 1 ADO.name i.e. BlueT5. I want it to be visible for all ADO.names. What is the problem here?
You can add speed.fps.ED to the ggplot aesthetic, as in:
geom_point(data = df,
aes(x = deltaV.fps, y = frspacing.ft, color = ADO.name, label = speed.fps.ED))
See also: how to choose variable to display in tooltip when using ggplotly?

Using aggregate on list in R

I have a list (lst3, subset below) and would like to do some calculations on it, e.g.:
lst4 <-lapply(lst3, function(x) aggregate(x[,5:ncol(x)], x[c(4)], FUN = mean)) #column means
lst5<-lapply(lst4,function(x) apply(x[,-c(1)],1,mean)) # get row mean
However, I am unable to get row mean without ignoring "Site".
I would like my final list to look like this:
lst5<-
[[1]]
Site x
G116 1.864233
[[2]]
Site x
GG16 2.064567
The essence is that the final list should have the above structure so that I can write my data to working directory using:
lapply(lst5,function(x)write.table(x,file=paste(getwd(),"summer",paste0(unique(x$Site),".csv"),
sep="/"),row.names=FALSE,quote=FALSE)) ### create a folder called "summer" and write files to directory###
Thanks,
AZ.
list(structure(list(Year = c(2005L, 2005L, 2005L), Month = c(8L,
8L, 8L), Day = 29:31, Site = structure(c(1L, 1L, 1L), .Label = "G116", class = "factor"),
Sim001 = c(8.4, 17.72, 6.03), Sim002 = c(0.27, 0, 0), Sim003 = c(2.83,
0.14, 0.1), Sim004 = c(0, 0, 0), Sim005 = c(0, 0.77, 0.28
), Sim006 = c(0, 0, 0), Sim007 = c(0, 0, 0), Sim008 = c(10.94,
4.77, 0), Sim009 = c(0, 0, 0), Sim010 = c(3.43, 2.74, 0.65
), Sim011 = c(0.36, 0, 2.75), Sim012 = c(26.91, 0, 2.16),
Sim013 = c(0.88, 1.33, 0.87), Sim014 = c(0, 0.86, 9.42),
Sim015 = c(0, 0.17, 1.15), Sim016 = c(0, 0, 0), Sim017 = c(0.13,
0, 0), Sim018 = c(0, 0, 6.72), Sim019 = c(8.45, 12.99, 23.72
), Sim020 = c(1.76, 0, 0), Sim021 = c(0, 0, 2.34), Sim022 = c(0,
0, 0), Sim023 = c(1.2, 0, 0.26), Sim024 = c(0.85, 0, 0),
Sim025 = c(0, 0, 0), Sim026 = c(2.05, 0.76, 5.03), Sim027 = c(0.78,
0, 0), Sim028 = c(1.2, 0, 0), Sim029 = c(22, 0.19, 0), Sim030 = c(0.12,
0, 0), Sim031 = c(3.1, 13.67, 0), Sim032 = c(0, 0, 17.88),
Sim033 = c(0, 0, 0), Sim034 = c(1.11, 0, 0), Sim035 = c(1.17,
1.41, 23.35), Sim036 = c(0, 0.48, 1.71), Sim037 = c(1.51,
11.1, 7.98), Sim038 = c(0, 0, 0), Sim039 = c(0, 0, 5.46),
Sim040 = c(5.21, 0, 0), Sim041 = c(0.1, 0.11, 0), Sim042 = c(0,
0.15, 5.23), Sim043 = c(0, 0, 0), Sim044 = c(0, 0.1, 0),
Sim045 = c(0, 0, 0), Sim046 = c(0, 0, 0), Sim047 = c(0, 0,
0.11), Sim048 = c(0, 0, 0), Sim049 = c(0, 0, 4.05), Sim050 = c(0,
0, 0), Sim051 = c(0, 0.12, 0), Sim052 = c(0.24, 2.58, 0),
Sim053 = c(3.63, 0, 0.17), Sim054 = c(10.94, 2.69, 0), Sim055 = c(0,
0, 0), Sim056 = c(0.24, 0.44, 8.27), Sim057 = c(0, 0, 0),
Sim058 = c(0, 0, 3.75), Sim059 = c(0.19, 11.06, 0), Sim060 = c(0,
0, 1.65), Sim061 = c(0, 4.95, 0), Sim062 = c(0.15, 0, 4.73
), Sim063 = c(2.99, 0.12, 1.28), Sim064 = c(0, 0, 0), Sim065 = c(0,
0, 0), Sim066 = c(0, 0, 0), Sim067 = c(0.11, 0.62, 0.56),
Sim068 = c(2.84, 0, 0), Sim069 = c(0, 0, 0), Sim070 = c(17.91,
0.11, 4.78), Sim071 = c(0, 0, 1.68), Sim072 = c(0, 0, 1.38
), Sim073 = c(1.68, 0, 0), Sim074 = c(0.53, 0, 2.87), Sim075 = c(0,
0, 0), Sim076 = c(2.58, 0.27, 0.11), Sim077 = c(0, 0, 0),
Sim078 = c(9.07, 3.13, 8.62), Sim079 = c(0.98, 0, 2.38),
Sim080 = c(3.4, 0, 0), Sim081 = c(0, 0, 4.57), Sim082 = c(1.87,
2.86, 0), Sim083 = c(21.76, 2.24, 0), Sim084 = c(0.45, 4.03,
0.39), Sim085 = c(0, 0, 0), Sim086 = c(0, 0, 0), Sim087 = c(0,
0, 17.12), Sim088 = c(5.05, 0, 0), Sim089 = c(0, 0, 1.4),
Sim090 = c(0.1, 0, 0), Sim091 = c(1.96, 0, 1.38), Sim092 = c(0,
0, 0), Sim093 = c(0, 0, 0), Sim094 = c(0, 0, 1.81), Sim095 = c(2.72,
7.16, 1.7), Sim096 = c(6.37, 0, 0), Sim097 = c(0, 1.12, 25.7
), Sim098 = c(0, 0, 0), Sim099 = c(0, 0, 0), Sim100 = c(6.77,
10.87, 2.6)), .Names = c("Year", "Month", "Day", "Site",
"Sim001", "Sim002", "Sim003", "Sim004", "Sim005", "Sim006", "Sim007",
"Sim008", "Sim009", "Sim010", "Sim011", "Sim012", "Sim013", "Sim014",
"Sim015", "Sim016", "Sim017", "Sim018", "Sim019", "Sim020", "Sim021",
"Sim022", "Sim023", "Sim024", "Sim025", "Sim026", "Sim027", "Sim028",
"Sim029", "Sim030", "Sim031", "Sim032", "Sim033", "Sim034", "Sim035",
"Sim036", "Sim037", "Sim038", "Sim039", "Sim040", "Sim041", "Sim042",
"Sim043", "Sim044", "Sim045", "Sim046", "Sim047", "Sim048", "Sim049",
"Sim050", "Sim051", "Sim052", "Sim053", "Sim054", "Sim055", "Sim056",
"Sim057", "Sim058", "Sim059", "Sim060", "Sim061", "Sim062", "Sim063",
"Sim064", "Sim065", "Sim066", "Sim067", "Sim068", "Sim069", "Sim070",
"Sim071", "Sim072", "Sim073", "Sim074", "Sim075", "Sim076", "Sim077",
"Sim078", "Sim079", "Sim080", "Sim081", "Sim082", "Sim083", "Sim084",
"Sim085", "Sim086", "Sim087", "Sim088", "Sim089", "Sim090", "Sim091",
"Sim092", "Sim093", "Sim094", "Sim095", "Sim096", "Sim097", "Sim098",
"Sim099", "Sim100"), row.names = 15947:15949, class = "data.frame"),
structure(list(Year = c(2005L, 2005L, 2005L), Month = c(8L,
8L, 8L), Day = 29:31, Site = structure(c(1L, 1L, 1L), .Label = "GG16", class = "factor"),
Sim001 = c(18.36, 0.33, 0.14), Sim002 = c(0, 10.92, 0
), Sim003 = c(0, 0, 0), Sim004 = c(0, 0, 1.7), Sim005 = c(0,
0, 0), Sim006 = c(0.91, 4.24, 0), Sim007 = c(0, 0, 0.22
), Sim008 = c(0.63, 2.9, 2.24), Sim009 = c(0, 0, 0),
Sim010 = c(0, 0, 6.91), Sim011 = c(0, 3.28, 10.18), Sim012 = c(8.39,
14.58, 45.62), Sim013 = c(2.87, 0.53, 0.11), Sim014 = c(9.15,
21.1, 0.66), Sim015 = c(0, 1.75, 2.2), Sim016 = c(0,
7.86, 0), Sim017 = c(0, 0, 0), Sim018 = c(0, 0, 0), Sim019 = c(0,
0, 0), Sim020 = c(0.39, 0, 0), Sim021 = c(0.13, 0, 1.05
), Sim022 = c(0, 0, 10.91), Sim023 = c(0.23, 0, 0), Sim024 = c(0.12,
0.83, 5.35), Sim025 = c(0, 0, 0), Sim026 = c(7.75, 0,
4.82), Sim027 = c(20.04, 0, 0), Sim028 = c(12.41, 0,
5.3), Sim029 = c(0, 0, 0), Sim030 = c(0, 0, 0), Sim031 = c(0,
8.06, 0), Sim032 = c(0, 0, 0), Sim033 = c(0, 0, 0), Sim034 = c(0.1,
0, 3.34), Sim035 = c(0, 4.34, 3.53), Sim036 = c(2.89,
0.27, 0), Sim037 = c(0, 0, 0), Sim038 = c(0, 0, 0), Sim039 = c(0,
0.11, 0), Sim040 = c(9.83, 1.55, 9.09), Sim041 = c(3.6,
0, 0), Sim042 = c(0, 0, 1.37), Sim043 = c(0, 0, 0), Sim044 = c(0,
0, 0), Sim045 = c(0, 0, 0), Sim046 = c(0, 0, 0), Sim047 = c(0,
20.52, 0.65), Sim048 = c(1.77, 0.67, 0), Sim049 = c(0,
0, 0), Sim050 = c(0, 0, 0), Sim051 = c(0, 4.9, 0), Sim052 = c(0.71,
11.34, 0), Sim053 = c(3.46, 2.59, 1.5), Sim054 = c(0,
23.63, 0), Sim055 = c(0, 16.48, 4.99), Sim056 = c(0,
0, 0), Sim057 = c(0, 0, 0), Sim058 = c(0, 0, 0), Sim059 = c(0,
0, 0), Sim060 = c(16.87, 0, 0), Sim061 = c(0, 3.43, 0
), Sim062 = c(0.45, 0, 0), Sim063 = c(0, 11.14, 7.22),
Sim064 = c(0, 0, 0), Sim065 = c(0, 0, 0), Sim066 = c(0,
16.08, 1.87), Sim067 = c(0, 0, 0), Sim068 = c(5.16, 0.88,
0.1), Sim069 = c(0, 0, 3.91), Sim070 = c(0, 0, 0), Sim071 = c(0.17,
0, 5.22), Sim072 = c(0, 0, 6.95), Sim073 = c(0, 0, 0),
Sim074 = c(0.14, 0, 0), Sim075 = c(0, 0, 0), Sim076 = c(0,
9.62, 0), Sim077 = c(0, 0, 0), Sim078 = c(1.65, 0, 0),
Sim079 = c(0.23, 8.41, 0.28), Sim080 = c(0.78, 0, 0),
Sim081 = c(0, 0, 0), Sim082 = c(0.11, 2.75, 0), Sim083 = c(0.26,
7.34, 5.92), Sim084 = c(0, 0, 4.27), Sim085 = c(0, 0,
0), Sim086 = c(0, 0, 0.1), Sim087 = c(27.18, 0.72, 28.29
), Sim088 = c(0, 0, 4.2), Sim089 = c(0, 9.37, 6.59),
Sim090 = c(0.21, 2.57, 0), Sim091 = c(0.45, 0, 0), Sim092 = c(0,
4.97, 0), Sim093 = c(1.43, 0, 0), Sim094 = c(0, 0, 2.15
), Sim095 = c(6, 0, 1.63), Sim096 = c(7.21, 0, 0), Sim097 = c(0,
0.39, 1.92), Sim098 = c(0, 0, 0), Sim099 = c(4.38, 0,
0), Sim100 = c(0, 0, 0)), .Names = c("Year", "Month",
"Day", "Site", "Sim001", "Sim002", "Sim003", "Sim004", "Sim005",
"Sim006", "Sim007", "Sim008", "Sim009", "Sim010", "Sim011",
"Sim012", "Sim013", "Sim014", "Sim015", "Sim016", "Sim017",
"Sim018", "Sim019", "Sim020", "Sim021", "Sim022", "Sim023",
"Sim024", "Sim025", "Sim026", "Sim027", "Sim028", "Sim029",
"Sim030", "Sim031", "Sim032", "Sim033", "Sim034", "Sim035",
"Sim036", "Sim037", "Sim038", "Sim039", "Sim040", "Sim041",
"Sim042", "Sim043", "Sim044", "Sim045", "Sim046", "Sim047",
"Sim048", "Sim049", "Sim050", "Sim051", "Sim052", "Sim053",
"Sim054", "Sim055", "Sim056", "Sim057", "Sim058", "Sim059",
"Sim060", "Sim061", "Sim062", "Sim063", "Sim064", "Sim065",
"Sim066", "Sim067", "Sim068", "Sim069", "Sim070", "Sim071",
"Sim072", "Sim073", "Sim074", "Sim075", "Sim076", "Sim077",
"Sim078", "Sim079", "Sim080", "Sim081", "Sim082", "Sim083",
"Sim084", "Sim085", "Sim086", "Sim087", "Sim088", "Sim089",
"Sim090", "Sim091", "Sim092", "Sim093", "Sim094", "Sim095",
"Sim096", "Sim097", "Sim098", "Sim099", "Sim100"), row.names = 15947:15949, class = "data.frame"))
You can go from lst3 directly to lst5 without the intermediate aggregate step:
# For each site's data frame: flatten every column after the first four into
# one vector and take its mean, labelled with the first Site value.
lapply(lst3, function(site_df) {
  sim_values <- unlist(site_df[-c(1:4)])
  data.frame(Site = site_df$Site[1], x = mean(sim_values))
})
#[[1]]
# Site x
#1 G116 1.864233
#
#[[2]]
# Site x
#1 GG16 2.064567
Since you're calculating the mean over all rows of every column except the first four, it's easy to unlist the data into a single vector and then call the standard mean on it. Also, by skipping the lst4 step, this will most likely be noticeably faster.
Or, as commented by Richard, a variation could be:
# Variation: take the mean of each column after the first four, then the mean
# of those column means, labelled with the first Site value.
lapply(lst3, function(site_df) {
  per_column <- colMeans(site_df[-c(1:4)])
  data.frame(Site = site_df$Site[1], x = mean(per_column))
})
Benchmark:
library(microbenchmark)
microbenchmark(
f1 = {lapply(lst3, function(df){
data.frame(Site = df$Site[1], x = mean(unlist(df[-c(1:4)])))
})},
f2 = {lapply(lst3, function(df){
data.frame(Site = df$Site[1], x = mean(colMeans(df[-c(1:4)])))
})},
unit = "relative"
)
Unit: relative
expr min lq median uq max neval
f1 1.00000 1.000000 1.000000 1.000000 1.000000 100
f2 2.91545 2.937272 2.927799 2.894704 3.486007 100
Here's another option for your consideration:
library(reshape2)
# Stack the whole list into one long data frame, keep only the rows whose
# variable name starts with "Sim", and average value within each Site.
x <- melt(lst3)
sim_rows <- x[grepl("^Sim.*", x$variable), ]
aggregate(value ~ Site, data = sim_rows, FUN = mean)
# Site value
#1 G116 1.864233
#2 GG16 2.064567
Or the same concept but using dplyr:
library(dplyr)
# Same idea with dplyr: keep the Sim* rows, then average value per Site.
x %>%
  filter(grepl("^Sim.*", variable)) %>%
  group_by(Site) %>%
  summarise(x = mean(value))
#Source: local data frame [2 x 2]
#
# Site x
#1 G116 1.864233
#2 GG16 2.064567
Of course, this could also be done using data.table, for example like this (there are probably several even slightly more efficient ways to do this in data.table):
library(data.table)
# Convert x to a data.table in place, then average value over the Sim* rows
# within each Site.
setDT(x)
x[grepl("^Sim.*", variable), list(x = mean(value)), by = Site]
# Site x
#1: G116 1.864233
#2: GG16 2.064567

Resources