I am trying to get a bar plot of sentiment scores with the bars arranged in the following order and drawn in two separate colors (one per group):
(NEGATIVE) anger, disgust, fear, sadness, negative --- (POSITIVE) anticipation, joy, surprise, trust, positive.
Below is the code which only gives a decreasing plot.
# Sum every sentiment column and order the totals from largest to smallest.
sentiment_totals <- sort(colSums(s), decreasing = TRUE)

# Bars are drawn in that descending order; only two colours are supplied,
# so they are reused across the bars rather than marking a sentiment group.
barplot(
  height = sentiment_totals,
  main = 'User Synergies',
  ylab = 'Count',
  col = rainbow(2),
  las = 2
)
> dput(head(s))
structure(list(anger = c(1, 0, 0, 0, 0, 0), anticipation = c(0,
0, 5, 0, 0, 0), disgust = c(0, 0, 0, 0, 0, 0), fear = c(1, 0,
2, 1, 0, 0), joy = c(1, 0, 1, 0, 0, 0), sadness = c(1, 0, 2,
1, 0, 0), surprise = c(0, 0, 2, 1, 0, 0), trust = c(4, 2, 3,
1, 0, 1), negative = c(2, 0, 3, 2, 1, 1), positive = c(4, 4,
7, 1, 0, 2)), row.names = c(NA, 6L), class = "data.frame")
Another way:
# Sentiment columns in the requested display order: the negative group
# first, then the positive group.
negative <- c("anger", "disgust", "fear", "sadness", "negative")
positive <- c("anticipation", "joy", "surprise", "trust", "positive")

# Column totals, kept in the order listed above (no sorting).
totals <- colSums(s[, c(negative, positive)])

barplot(
  totals,
  las = 2,
  # One colour per group, repeated once for every bar in that group.
  col = rep(c("red", "cyan"), times = c(length(negative), length(positive))),
  ylab = "Count",
  # Derive the axis limit from the data instead of a fixed upper bound of 20,
  # so totals larger than 20 are not cut off; pretty() yields a rounded
  # limit at or above the tallest bar.
  ylim = c(0, max(pretty(c(0, totals)))),
  main = "User Synergies"
)
The result:
Try this:
# Six sample rows of sentiment scores, one column per sentiment.
df <- structure(
  list(
    anger = c(1, 0, 0, 0, 0, 0),
    anticipation = c(0, 0, 5, 0, 0, 0),
    disgust = c(0, 0, 0, 0, 0, 0),
    fear = c(1, 0, 2, 1, 0, 0),
    joy = c(1, 0, 1, 0, 0, 0),
    sadness = c(1, 0, 2, 1, 0, 0),
    surprise = c(0, 0, 2, 1, 0, 0),
    trust = c(4, 2, 3, 1, 0, 1),
    negative = c(2, 0, 3, 2, 1, 1),
    positive = c(4, 4, 7, 1, 0, 2)
  ),
  row.names = c(NA, 6L),
  class = "data.frame"
)

# "#FF0000" "#00FFFF": pn[1] marks positive sentiments, pn[2] negative ones.
pn <- rainbow(2)

# Column totals, largest first.
s <- sort(colSums(df), decreasing = TRUE)
names(s)
#> [1] "positive" "trust" "negative" "anticipation" "fear"
#> [6] "sadness" "surprise" "joy" "anger" "disgust"

# Pick each bar's colour from its name rather than from its position, so the
# colouring stays correct when different data changes the sort order.
positive <- c("anticipation", "joy", "surprise", "trust", "positive")
col <- ifelse(names(s) %in% positive, pn[1], pn[2])

barplot(
  s,
  las = 2,
  col = col,
  ylab = "Count",
  main = "User Synergies"
)
Created on 2022-05-31 by the reprex package (v2.0.1)
You may try
library(dplyr)
library(reshape2)
# Example totals, one column per sentiment.
df <- data.frame(
  anger = 200,
  disgust = 100,
  fear = 900,
  sadness = 400,
  negative = 1500,
  anticipation = 2000,
  joy = 1200,
  surprise = 300,
  trust = 2500,
  positive = 5000
)

# Sentiment groups in display order, listed once so the factor levels and
# the colour assignment cannot drift apart.
negative <- c("anger", "disgust", "fear", "sadness", "negative")
positive <- c("anticipation", "joy", "surprise", "trust", "positive")
pall <- c("red", "blue") # pall[1] colours negative bars, pall[2] positive bars

plot_data <- colSums(df) %>%
  melt() %>%
  tibble::rownames_to_column("sentiments") %>%
  mutate(
    sentiments = factor(sentiments, levels = c(negative, positive)),
    # Store the colour itself; indexing pall by a 0/1 factor would pick the
    # wrong entry whenever only one of the two groups is present.
    colo = ifelse(sentiments %in% negative, pall[1], pall[2])
  ) %>%
  # Sort rows into factor-level order so that the colour vector lines up
  # with the bars even if the input columns arrive in a different order
  # (the formula interface is expected to lay bars out by factor level).
  arrange(sentiments)

barplot(
  value ~ sentiments,
  data = plot_data,
  col = plot_data$colo,
  las = 2,
  xlab = ""
)
Another approach :
# Six sample rows of sentiment scores, one column per sentiment.
df <- structure(
  list(
    anger = c(1, 0, 0, 0, 0, 0),
    anticipation = c(0, 0, 5, 0, 0, 0),
    disgust = c(0, 0, 0, 0, 0, 0),
    fear = c(1, 0, 2, 1, 0, 0),
    joy = c(1, 0, 1, 0, 0, 0),
    sadness = c(1, 0, 2, 1, 0, 0),
    surprise = c(0, 0, 2, 1, 0, 0),
    trust = c(4, 2, 3, 1, 0, 1),
    negative = c(2, 0, 3, 2, 1, 1),
    positive = c(4, 4, 7, 1, 0, 2)
  ),
  row.names = c(NA, 6L),
  class = "data.frame"
)

# Column totals, largest first.
s <- sort(colSums(df), decreasing = TRUE)

# Names of the columns that belong to the positive group.
pos <- c("positive", "trust", "anticipation", "surprise", "joy")

# Start with every bar red, then switch the positive sentiments to cyan.
col <- rep("red", length(s))
col[names(s) %in% pos] <- "cyan"

barplot(
  height = s,
  main = 'User Synergies',
  ylab = 'Count',
  col = col,
  las = 2
)
Created on 2022-05-31 by the reprex package (v2.0.1)
It's my first time using R. I want to create a scatterplot with a line of best fit for a decade of data about all countries. I joined two Excel datasets: one has the number of people jailed for a certain crime by country in a given year (rows: country, columns: year), and the other has the average income for a certain population group (rows: country, columns: year).
# Keep only the countries present in both tables, matching rows on "Country".
dataclean <- inner_join(x = EnforcementData, y = IncomeData, by = "Country")
This gives me a dataset with x, y points where enforcement is the x and income is the y
I want to plot this and find the outliers - so those countries where enforcement is out of step with income. I tried:
# The joined table is wide: one "<year>.x" column (enforcement counts, from
# the first table) and one "<year>.y" column (income, from the second table)
# per year. ggplot2 aesthetics must name columns of the plotted data frame,
# so stack those columns into one row per country-year first.
enforcement_cols <- grep("\\.x$", names(dataclean), value = TRUE)
income_cols <- sub("\\.x$", ".y", enforcement_cols)

# unlist() concatenates column by column, i.e. all countries for the first
# year, then all countries for the second year, and so on; Country and Year
# are repeated to follow that layout.
dataclean_long <- data.frame(
  Country = rep(dataclean$Country, times = length(enforcement_cols)),
  Year = rep(sub("\\.x$", "", enforcement_cols), each = nrow(dataclean)),
  Enforcement = unlist(dataclean[enforcement_cols], use.names = FALSE),
  Income = unlist(dataclean[income_cols], use.names = FALSE)
)

# Scatterplot of every country-year with a single least-squares line.
# Country is mapped unquoted (a quoted "Country" is just a constant string);
# the legend is hidden because one entry per country would swamp the plot.
ggplot(dataclean_long, aes(x = Enforcement, y = Income)) +
  geom_point(aes(color = Country), show.legend = FALSE) +
  geom_smooth(method = "lm", formula = y ~ x)
Thanks for any suggestions!
EDIT: I think I've improperly merged the datasets somehow, as it returns a matrix. Like this:
dput(head(dataclean))
structure(list(Country = c("Albania", "Algeria", "Angola", "Antigua and Barbuda",
"Argentina", "Armenia"), 2006.x = c(0, 0, 0, 0, 0, 0), 2007.x = c(0,
0, 0, 0, 0, 0), 2008.x = c(0, 0, 0, 0, 3, 0), 2009.x = c(0,
0, 0, 0, 2, 0), 2010.x = c(0, 0, 3, 0, 0, 0), 2011.x = c(0,
0, 0, 0, 4, 0), 2012.x = c(0, 0, 0, 0, 2, 0), 2013.x = c(1,
1, 3, 0, 3, 0), 2014.x = c(0, 0, 0, 0, 1, 0), 2015.x = c(0,
0, 1, 1, 0, 0), 2016.x = c(0, 0, 5, 1, 5, 0), 2017.x = c(0,
0, 3, 0, 0, 0), 2018.x = c(0, 0, 0, 0, 0, 0), 2019.x = c(0,
1, 3, 0, 0, 0), 2020.x = c(0, 1, 0, 0, 0, 0), 2006.y = c(3.273755,
2.9912451, 3.689971, 1.342365, 2.8111637, 3.1407325), 2007.y = c(3.157699,
3.0298389, 3.759603, 1.315153, 2.8102016, 3.2122944), 2008.y = c(3.0636166,
3.0644794, 3.754531, 1.181255, 2.9054865, 3.1780076), 2009.y = c(3.0084051,
3.0477934, 3.874565, 1.144331, 2.9149061, 3.0896677), 2010.y = c(2.9951254,
2.9948973, 3.796005, 1.161454, 2.8314702, 3.1664003), 2011.y = c(3.1528966,
3.0144704, 3.814187, 1.190574, 2.8360401, 3.1267727), 2012.y = c(3.1964009,
2.9731618, 3.73838, 1.201921, 2.913096, 3.0577149), 2013.y = c(3.1683419,
2.943247, 3.779373, 1.209151, 2.9020493, 3.0017037), 2014.y = c(3.0180735,
3.0699088, 3.913854, 1.8298544, 3.0114942, 2.9938708), 2015.y = c(2.9489451,
3.1155215, 3.864924, 1.7799824, 3.0169873, 3.0037498), 2016.y = c(2.8750588,
3.1476701, 3.909438, 1.7761061, 2.7538409, 3.041738), 2017.y = c(2.8906318,
3.0717401, 3.880863, 2.2256225, 2.7280908, 3.0332232), 2018.y = c(2.9485421,
3.12678, 3.609102, 2.1923678, 2.5386973, 2.8175096), 2019.y = c(3.0029988,
3.0910585, 3.524361, 2.1915031, 2.5461976, 2.6481938), 2020.y = c(1.9297139,
3.1117555, 3.3970031, 2.1946293, 2.5862916, 2.438313)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
I've created a decision tree using rpart and the code below:
# Fit a classification tree. method = "class" is stated explicitly because
# the response in the sample data is numeric 0/1; without it rpart fits a
# regression tree, for which predict(..., type = "class") fails with
# 'Invalid prediction for "rpart" object'.
res.tree <- rpart(myformula, data = credit_train, method = "class")
my data has been subset into 2 parts. The training part at 70% and a testing part at 30%.
This part works well and my tree is created. Where I'm getting stuck is with the prediction so that I can calculate my confusion matrix and ROC curves.
I'm using this code: tree_pred = predict(res.tree, credit_test, type = "class")
but I get this message:
Error in predict.rpart(res.tree, credit_test, type = "class") : Invalid prediction for "rpart" object
In addition:
Warning message:
'newdata' had 271 rows but variables found have 729 rows
I can't figure out whether I'm missing a library or what is causing it not to recognize the type (which so many resources say I need to use), or why I'm getting a mismatch in the rows.
The 'newdata' at 271 rows is what my testing data set has and my training data-set has 729 rows.
Is the decision tree creation causing my problem or could it be the prediction code?
Responding to comments:
I'm using the following libraries:
library(readxl)
library(dplyr)
library(factoextra)
library(corrplot)
library(rpart)
library(rpart.plot)
library(RColorBrewer)
library(pROC)
library(Hmisc)
library(fBasics)
library(rattle)
library(caret)
A sample of my data:
structure(list(CHK_ACCT = c(0, 1, 0, 0), DURATION = c(6, 48,
42, 24), HISTORY = c(4, 2, 2, 3), NEW_CAR = c(0, 0, 0, 1), USED_CAR = c(0,
0, 0, 0), FURNITURE = c(0, 0, 1, 0), `RADIO/TV` = c(1, 1, 0,
0), EDUCATION = c(0, 0, 0, 0), RETRAINING = c(0, 0, 0, 0), AMOUNT = c(1169,
5951, 7882, 4870), SAV_ACCT = c(4, 0, 0, 0), EMPLOYMENT = c(4,
2, 3, 2), INSTALL_RATE = c(4, 2, 2, 3), MALE_DIV = c(0, 0, 0,
0), MALE_SINGLE = c(1, 0, 1, 1), MALE_MAR_or_WID = c(0, 0, 0,
0), `CO-APPLICANT` = c(0, 0, 0, 0), GUARANTOR = c(0, 0, 1, 0),
PRESENT_RESIDENT = c(4, 2, 4, 4), REAL_ESTATE = c(1, 1, 0,
0), PROP_UNKN_NONE = c(0, 0, 0, 1), AGE = c(67, 22, 45, 53
), OTHER_INSTALL = c(0, 0, 0, 0), RENT = c(0, 0, 0, 0), OWN_RES = c(1,
1, 0, 0), NUM_CREDITS = c(2, 1, 1, 2), JOB = c(2, 2, 2, 2
), NUM_DEPENDENTS = c(1, 1, 2, 2), TELEPHONE = c(1, 0, 0,
0), FOREIGN = c(0, 0, 0, 0), DEFAULT = c(0, 1, 0, 1), CHK_ACCT_rec = c(1,
2, 1, 1), SAV_ACCT_rec = c(0, 1, 1, 1)), .Names = c("CHK_ACCT",
"DURATION", "HISTORY", "NEW_CAR", "USED_CAR", "FURNITURE", "RADIO/TV",
"EDUCATION", "RETRAINING", "AMOUNT", "SAV_ACCT", "EMPLOYMENT",
"INSTALL_RATE", "MALE_DIV", "MALE_SINGLE", "MALE_MAR_or_WID",
"CO-APPLICANT", "GUARANTOR", "PRESENT_RESIDENT", "REAL_ESTATE",
"PROP_UNKN_NONE", "AGE", "OTHER_INSTALL", "RENT", "OWN_RES",
"NUM_CREDITS", "JOB", "NUM_DEPENDENTS", "TELEPHONE", "FOREIGN",
"DEFAULT", "CHK_ACCT_rec", "SAV_ACCT_rec"), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
# Model formula for the credit-default tree.
#
# Columns are referenced by bare name so that the modelling function looks
# them up in whatever data frame is passed via `data =` / `newdata =`.
# Writing `credit_train$col` instead pins every term to the training data,
# so predict() silently ignores `newdata` and warns that the variables found
# have the training set's row count.
#
# Each line ends with `+`: a line that starts with `+` would be parsed as a
# separate expression and cut the formula short.
#
# Note: for a classification tree the response must be a factor, or
# method = "class" must be passed to rpart().
myformula <- DEFAULT ~ CHK_ACCT_rec +
  DURATION + HISTORY + NEW_CAR +
  USED_CAR + FURNITURE + `RADIO/TV` +
  EDUCATION + RETRAINING + AMOUNT +
  SAV_ACCT_rec + EMPLOYMENT +
  INSTALL_RATE + MALE_DIV + MALE_SINGLE +
  MALE_MAR_or_WID + `CO-APPLICANT` +
  GUARANTOR + PRESENT_RESIDENT +
  REAL_ESTATE + PROP_UNKN_NONE + AGE +
  OTHER_INSTALL + RENT + OWN_RES +
  NUM_CREDITS + JOB + NUM_DEPENDENTS +
  TELEPHONE + FOREIGN
#calimo I hope this is what you needed.
I have a list (lst3, subset below) and would like to do some calculations on it, e.g.:
# Column means of every column from the 5th onwards, grouped by the 4th
# column (Site); returns one row per site.
site_column_means <- function(x) {
  aggregate(x[, 5:ncol(x)], x[c(4)], FUN = mean)
}
lst4 <- lapply(lst3, site_column_means)

# Mean across each row after dropping the first column (Site), so the
# result carries no Site information.
row_means_without_site <- function(x) {
  apply(x[, -c(1)], 1, mean)
}
lst5 <- lapply(lst4, row_means_without_site)
However, I am unable to get row mean without ignoring "Site".
I would like my final list to look like this:
lst5<-
[[1]]
Site x
G116 1.864233
[[2]]
Site x
GG16 2.064567
The essence is that the final list should have the above structure so that I can write my data to working directory using:
# Write every per-site table to "<working directory>/summer/<Site>.csv".
out_dir <- file.path(getwd(), "summer")

# write.table() does not create missing directories, so make sure the
# "summer" folder exists first (no warning if it is already there).
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

# invisible() keeps the list of NULLs returned by write.table() from being
# printed; the call is made only for its file-writing side effect.
invisible(lapply(lst5, function(x) {
  write.table(
    x,
    file = file.path(out_dir, paste0(unique(x$Site), ".csv")),
    row.names = FALSE,
    quote = FALSE
  )
}))
Thanks,
AZ.
list(structure(list(Year = c(2005L, 2005L, 2005L), Month = c(8L,
8L, 8L), Day = 29:31, Site = structure(c(1L, 1L, 1L), .Label = "G116", class = "factor"),
Sim001 = c(8.4, 17.72, 6.03), Sim002 = c(0.27, 0, 0), Sim003 = c(2.83,
0.14, 0.1), Sim004 = c(0, 0, 0), Sim005 = c(0, 0.77, 0.28
), Sim006 = c(0, 0, 0), Sim007 = c(0, 0, 0), Sim008 = c(10.94,
4.77, 0), Sim009 = c(0, 0, 0), Sim010 = c(3.43, 2.74, 0.65
), Sim011 = c(0.36, 0, 2.75), Sim012 = c(26.91, 0, 2.16),
Sim013 = c(0.88, 1.33, 0.87), Sim014 = c(0, 0.86, 9.42),
Sim015 = c(0, 0.17, 1.15), Sim016 = c(0, 0, 0), Sim017 = c(0.13,
0, 0), Sim018 = c(0, 0, 6.72), Sim019 = c(8.45, 12.99, 23.72
), Sim020 = c(1.76, 0, 0), Sim021 = c(0, 0, 2.34), Sim022 = c(0,
0, 0), Sim023 = c(1.2, 0, 0.26), Sim024 = c(0.85, 0, 0),
Sim025 = c(0, 0, 0), Sim026 = c(2.05, 0.76, 5.03), Sim027 = c(0.78,
0, 0), Sim028 = c(1.2, 0, 0), Sim029 = c(22, 0.19, 0), Sim030 = c(0.12,
0, 0), Sim031 = c(3.1, 13.67, 0), Sim032 = c(0, 0, 17.88),
Sim033 = c(0, 0, 0), Sim034 = c(1.11, 0, 0), Sim035 = c(1.17,
1.41, 23.35), Sim036 = c(0, 0.48, 1.71), Sim037 = c(1.51,
11.1, 7.98), Sim038 = c(0, 0, 0), Sim039 = c(0, 0, 5.46),
Sim040 = c(5.21, 0, 0), Sim041 = c(0.1, 0.11, 0), Sim042 = c(0,
0.15, 5.23), Sim043 = c(0, 0, 0), Sim044 = c(0, 0.1, 0),
Sim045 = c(0, 0, 0), Sim046 = c(0, 0, 0), Sim047 = c(0, 0,
0.11), Sim048 = c(0, 0, 0), Sim049 = c(0, 0, 4.05), Sim050 = c(0,
0, 0), Sim051 = c(0, 0.12, 0), Sim052 = c(0.24, 2.58, 0),
Sim053 = c(3.63, 0, 0.17), Sim054 = c(10.94, 2.69, 0), Sim055 = c(0,
0, 0), Sim056 = c(0.24, 0.44, 8.27), Sim057 = c(0, 0, 0),
Sim058 = c(0, 0, 3.75), Sim059 = c(0.19, 11.06, 0), Sim060 = c(0,
0, 1.65), Sim061 = c(0, 4.95, 0), Sim062 = c(0.15, 0, 4.73
), Sim063 = c(2.99, 0.12, 1.28), Sim064 = c(0, 0, 0), Sim065 = c(0,
0, 0), Sim066 = c(0, 0, 0), Sim067 = c(0.11, 0.62, 0.56),
Sim068 = c(2.84, 0, 0), Sim069 = c(0, 0, 0), Sim070 = c(17.91,
0.11, 4.78), Sim071 = c(0, 0, 1.68), Sim072 = c(0, 0, 1.38
), Sim073 = c(1.68, 0, 0), Sim074 = c(0.53, 0, 2.87), Sim075 = c(0,
0, 0), Sim076 = c(2.58, 0.27, 0.11), Sim077 = c(0, 0, 0),
Sim078 = c(9.07, 3.13, 8.62), Sim079 = c(0.98, 0, 2.38),
Sim080 = c(3.4, 0, 0), Sim081 = c(0, 0, 4.57), Sim082 = c(1.87,
2.86, 0), Sim083 = c(21.76, 2.24, 0), Sim084 = c(0.45, 4.03,
0.39), Sim085 = c(0, 0, 0), Sim086 = c(0, 0, 0), Sim087 = c(0,
0, 17.12), Sim088 = c(5.05, 0, 0), Sim089 = c(0, 0, 1.4),
Sim090 = c(0.1, 0, 0), Sim091 = c(1.96, 0, 1.38), Sim092 = c(0,
0, 0), Sim093 = c(0, 0, 0), Sim094 = c(0, 0, 1.81), Sim095 = c(2.72,
7.16, 1.7), Sim096 = c(6.37, 0, 0), Sim097 = c(0, 1.12, 25.7
), Sim098 = c(0, 0, 0), Sim099 = c(0, 0, 0), Sim100 = c(6.77,
10.87, 2.6)), .Names = c("Year", "Month", "Day", "Site",
"Sim001", "Sim002", "Sim003", "Sim004", "Sim005", "Sim006", "Sim007",
"Sim008", "Sim009", "Sim010", "Sim011", "Sim012", "Sim013", "Sim014",
"Sim015", "Sim016", "Sim017", "Sim018", "Sim019", "Sim020", "Sim021",
"Sim022", "Sim023", "Sim024", "Sim025", "Sim026", "Sim027", "Sim028",
"Sim029", "Sim030", "Sim031", "Sim032", "Sim033", "Sim034", "Sim035",
"Sim036", "Sim037", "Sim038", "Sim039", "Sim040", "Sim041", "Sim042",
"Sim043", "Sim044", "Sim045", "Sim046", "Sim047", "Sim048", "Sim049",
"Sim050", "Sim051", "Sim052", "Sim053", "Sim054", "Sim055", "Sim056",
"Sim057", "Sim058", "Sim059", "Sim060", "Sim061", "Sim062", "Sim063",
"Sim064", "Sim065", "Sim066", "Sim067", "Sim068", "Sim069", "Sim070",
"Sim071", "Sim072", "Sim073", "Sim074", "Sim075", "Sim076", "Sim077",
"Sim078", "Sim079", "Sim080", "Sim081", "Sim082", "Sim083", "Sim084",
"Sim085", "Sim086", "Sim087", "Sim088", "Sim089", "Sim090", "Sim091",
"Sim092", "Sim093", "Sim094", "Sim095", "Sim096", "Sim097", "Sim098",
"Sim099", "Sim100"), row.names = 15947:15949, class = "data.frame"),
structure(list(Year = c(2005L, 2005L, 2005L), Month = c(8L,
8L, 8L), Day = 29:31, Site = structure(c(1L, 1L, 1L), .Label = "GG16", class = "factor"),
Sim001 = c(18.36, 0.33, 0.14), Sim002 = c(0, 10.92, 0
), Sim003 = c(0, 0, 0), Sim004 = c(0, 0, 1.7), Sim005 = c(0,
0, 0), Sim006 = c(0.91, 4.24, 0), Sim007 = c(0, 0, 0.22
), Sim008 = c(0.63, 2.9, 2.24), Sim009 = c(0, 0, 0),
Sim010 = c(0, 0, 6.91), Sim011 = c(0, 3.28, 10.18), Sim012 = c(8.39,
14.58, 45.62), Sim013 = c(2.87, 0.53, 0.11), Sim014 = c(9.15,
21.1, 0.66), Sim015 = c(0, 1.75, 2.2), Sim016 = c(0,
7.86, 0), Sim017 = c(0, 0, 0), Sim018 = c(0, 0, 0), Sim019 = c(0,
0, 0), Sim020 = c(0.39, 0, 0), Sim021 = c(0.13, 0, 1.05
), Sim022 = c(0, 0, 10.91), Sim023 = c(0.23, 0, 0), Sim024 = c(0.12,
0.83, 5.35), Sim025 = c(0, 0, 0), Sim026 = c(7.75, 0,
4.82), Sim027 = c(20.04, 0, 0), Sim028 = c(12.41, 0,
5.3), Sim029 = c(0, 0, 0), Sim030 = c(0, 0, 0), Sim031 = c(0,
8.06, 0), Sim032 = c(0, 0, 0), Sim033 = c(0, 0, 0), Sim034 = c(0.1,
0, 3.34), Sim035 = c(0, 4.34, 3.53), Sim036 = c(2.89,
0.27, 0), Sim037 = c(0, 0, 0), Sim038 = c(0, 0, 0), Sim039 = c(0,
0.11, 0), Sim040 = c(9.83, 1.55, 9.09), Sim041 = c(3.6,
0, 0), Sim042 = c(0, 0, 1.37), Sim043 = c(0, 0, 0), Sim044 = c(0,
0, 0), Sim045 = c(0, 0, 0), Sim046 = c(0, 0, 0), Sim047 = c(0,
20.52, 0.65), Sim048 = c(1.77, 0.67, 0), Sim049 = c(0,
0, 0), Sim050 = c(0, 0, 0), Sim051 = c(0, 4.9, 0), Sim052 = c(0.71,
11.34, 0), Sim053 = c(3.46, 2.59, 1.5), Sim054 = c(0,
23.63, 0), Sim055 = c(0, 16.48, 4.99), Sim056 = c(0,
0, 0), Sim057 = c(0, 0, 0), Sim058 = c(0, 0, 0), Sim059 = c(0,
0, 0), Sim060 = c(16.87, 0, 0), Sim061 = c(0, 3.43, 0
), Sim062 = c(0.45, 0, 0), Sim063 = c(0, 11.14, 7.22),
Sim064 = c(0, 0, 0), Sim065 = c(0, 0, 0), Sim066 = c(0,
16.08, 1.87), Sim067 = c(0, 0, 0), Sim068 = c(5.16, 0.88,
0.1), Sim069 = c(0, 0, 3.91), Sim070 = c(0, 0, 0), Sim071 = c(0.17,
0, 5.22), Sim072 = c(0, 0, 6.95), Sim073 = c(0, 0, 0),
Sim074 = c(0.14, 0, 0), Sim075 = c(0, 0, 0), Sim076 = c(0,
9.62, 0), Sim077 = c(0, 0, 0), Sim078 = c(1.65, 0, 0),
Sim079 = c(0.23, 8.41, 0.28), Sim080 = c(0.78, 0, 0),
Sim081 = c(0, 0, 0), Sim082 = c(0.11, 2.75, 0), Sim083 = c(0.26,
7.34, 5.92), Sim084 = c(0, 0, 4.27), Sim085 = c(0, 0,
0), Sim086 = c(0, 0, 0.1), Sim087 = c(27.18, 0.72, 28.29
), Sim088 = c(0, 0, 4.2), Sim089 = c(0, 9.37, 6.59),
Sim090 = c(0.21, 2.57, 0), Sim091 = c(0.45, 0, 0), Sim092 = c(0,
4.97, 0), Sim093 = c(1.43, 0, 0), Sim094 = c(0, 0, 2.15
), Sim095 = c(6, 0, 1.63), Sim096 = c(7.21, 0, 0), Sim097 = c(0,
0.39, 1.92), Sim098 = c(0, 0, 0), Sim099 = c(4.38, 0,
0), Sim100 = c(0, 0, 0)), .Names = c("Year", "Month",
"Day", "Site", "Sim001", "Sim002", "Sim003", "Sim004", "Sim005",
"Sim006", "Sim007", "Sim008", "Sim009", "Sim010", "Sim011",
"Sim012", "Sim013", "Sim014", "Sim015", "Sim016", "Sim017",
"Sim018", "Sim019", "Sim020", "Sim021", "Sim022", "Sim023",
"Sim024", "Sim025", "Sim026", "Sim027", "Sim028", "Sim029",
"Sim030", "Sim031", "Sim032", "Sim033", "Sim034", "Sim035",
"Sim036", "Sim037", "Sim038", "Sim039", "Sim040", "Sim041",
"Sim042", "Sim043", "Sim044", "Sim045", "Sim046", "Sim047",
"Sim048", "Sim049", "Sim050", "Sim051", "Sim052", "Sim053",
"Sim054", "Sim055", "Sim056", "Sim057", "Sim058", "Sim059",
"Sim060", "Sim061", "Sim062", "Sim063", "Sim064", "Sim065",
"Sim066", "Sim067", "Sim068", "Sim069", "Sim070", "Sim071",
"Sim072", "Sim073", "Sim074", "Sim075", "Sim076", "Sim077",
"Sim078", "Sim079", "Sim080", "Sim081", "Sim082", "Sim083",
"Sim084", "Sim085", "Sim086", "Sim087", "Sim088", "Sim089",
"Sim090", "Sim091", "Sim092", "Sim093", "Sim094", "Sim095",
"Sim096", "Sim097", "Sim098", "Sim099", "Sim100"), row.names = 15947:15949, class = "data.frame"))
You can go from lst3 directly to lst5 without the intermediate aggregate step:
# One-row summary for a single data frame: its first Site value plus the
# mean of every value outside the first four columns.
site_overall_mean <- function(df) {
  sim_values <- unlist(df[-c(1:4)])
  data.frame(Site = df$Site[1], x = mean(sim_values))
}

lapply(lst3, site_overall_mean)
#[[1]]
# Site x
#1 G116 1.864233
#
#[[2]]
# Site x
#1 GG16 2.064567
Since you're calculating the mean of all columns except the first 4 columns and over all the rows of the other columns, it's quite easy to unlist the data, creating a single vector, and then use the standard mean on it. Also, by skipping the lst4 step, this will most likely be noticeably faster.
Or, as commented by Richard, a variation could be:
# One-row summary for a single data frame: its first Site value plus the
# mean of the per-column means of everything outside the first four columns.
site_mean_of_col_means <- function(df) {
  col_means <- colMeans(df[-c(1:4)])
  data.frame(Site = df$Site[1], x = mean(col_means))
}

lapply(lst3, site_mean_of_col_means)
Benchmark:
library(microbenchmark)
# Variant 1: flatten all non-leading columns into one vector, then average.
bench_unlist <- function(df) {
  data.frame(Site = df$Site[1], x = mean(unlist(df[-c(1:4)])))
}

# Variant 2: average the per-column means of the same columns.
bench_col_means <- function(df) {
  data.frame(Site = df$Site[1], x = mean(colMeans(df[-c(1:4)])))
}

# Time both variants over the whole list; results are reported relative to
# the fastest expression.
microbenchmark(
  f1 = lapply(lst3, bench_unlist),
  f2 = lapply(lst3, bench_col_means),
  unit = "relative"
)
Unit: relative
expr min lq median uq max neval
f1 1.00000 1.000000 1.000000 1.000000 1.000000 100
f2 2.91545 2.937272 2.927799 2.894704 3.486007 100
Here's another option for your consideration:
library(reshape2)
# Stack the whole list into one long table.
x <- melt(lst3)

# Keep only the rows whose variable name starts with "Sim", then average
# the values per site.
sim_rows <- x[grepl("^Sim.*", x$variable), ]
aggregate(value ~ Site, data = sim_rows, FUN = mean)
# Site value
#1 G116 1.864233
#2 GG16 2.064567
Or the same concept but using dplyr:
library(dplyr)
# Same calculation as a pipeline: keep the "Sim..." rows, then compute the
# mean value for each site.
x %>%
  filter(grepl("^Sim.*", variable)) %>%
  group_by(Site) %>%
  summarise(x = mean(value))
#Source: local data frame [2 x 2]
#
# Site x
#1 G116 1.864233
#2 GG16 2.064567
Of course, this could also be done using data.table, for example like this (there are probably several even slightly more efficient ways to do this in data.table):
library(data.table)
# Convert x to a data.table in place, then average the "Sim..." rows by site.
setDT(x)
x[grepl("^Sim.*", variable), .(x = mean(value)), by = Site]
# Site x
#1: G116 1.864233
#2: GG16 2.064567