Decision Tree Keeps Using Y-variable in Tree Decision Making - r
I'm using C5.0 to make a decision tree, and it's using my class label in the tree. A snippet of my data is below.
trainX
V1 V2 V3 V4 V5 V6
1 39 State-gov 77516 Bachelors 13 Never-married
2 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse
3 38 Private 215646 HS-grad 9 Divorced
4 53 Private 234721 11th 7 Married-civ-spouse
5 28 Private 338409 Bachelors 13 Married-civ-spouse
V7 V8 V9 V10 V11 V12 V13 V14
1 Adm-clerical Not-in-family White Male 2174 0 40 United-States
2 Exec-managerial Husband White Male 0 0 13 United-States
3 Handlers-cleaners Not-in-family White Male 0 0 40 United-States
4 Handlers-cleaners Husband Black Male 0 0 40 United-States
5 Prof-specialty Wife Black Female 0 0 40 Cuba
trainY
[1] <=50K <=50K <=50K <=50K <=50K
There are cases in my data of >50K as well, this snippet of 5 just did not contain any.
When I make my tree, this is the code I use
# Load the C5.0 implementation.
library(C50)

# Rows 1-100 of X/Y become the training set; rows 101-150 become the test set.
trainX <- X[1:100, ]
trainY <- Y[1:100]
testX <- X[101:150, ]
testY <- Y[101:150]

# Fit the tree on the training rows and print the summary of the fitted model.
model <- C5.0(trainX, trainY)
summary(model)
And the output I get is...
Decision tree:
<=50K (100/25)
Evaluation on training data (100 cases):
Decision Tree
----------------
Size Errors
1 25(25.0%) <<
(a) (b) <-classified as
---- ----
75 (a): class <=50K
25 (b): class >50K
What am I doing wrong that it's using the classification as part of the tree?
EDIT - dput() output of head() is below. It still gives me the same issue, where it's making a decision tree using the split as <=50K or >50K, which is my "Y" output and thus shouldn't be part of the decision-making process.
trainX
structure(list(V1 = c(39L, 50L, 38L, 53L, 28L, 37L), V2 = structure(c(8L,
7L, 5L, 5L, 5L, 5L), .Label = c(" ?", " Federal-gov", " Local-gov",
" Never-worked", " Private", " Self-emp-inc", " Self-emp-not-inc",
" State-gov", " Without-pay"), class = "factor"), V3 = c(77516L,
83311L, 215646L, 234721L, 338409L, 284582L), V4 = structure(c(10L,
10L, 12L, 2L, 10L, 13L), .Label = c(" 10th", " 11th", " 12th",
" 1st-4th", " 5th-6th", " 7th-8th", " 9th", " Assoc-acdm", " Assoc-voc",
" Bachelors", " Doctorate", " HS-grad", " Masters", " Preschool",
" Prof-school", " Some-college"), class = "factor"), V5 = c(13L,
13L, 9L, 7L, 13L, 14L), V6 = structure(c(5L, 3L, 1L, 3L, 3L,
3L), .Label = c(" Divorced", " Married-AF-spouse", " Married-civ-spouse",
" Married-spouse-absent", " Never-married", " Separated", " Widowed"
), class = "factor"), V7 = structure(c(2L, 5L, 7L, 7L, 11L, 5L
), .Label = c(" ?", " Adm-clerical", " Armed-Forces", " Craft-repair",
" Exec-managerial", " Farming-fishing", " Handlers-cleaners",
" Machine-op-inspct", " Other-service", " Priv-house-serv", " Prof-specialty",
" Protective-serv", " Sales", " Tech-support", " Transport-moving"
), class = "factor"), V8 = structure(c(2L, 1L, 2L, 1L, 6L, 6L
), .Label = c(" Husband", " Not-in-family", " Other-relative",
" Own-child", " Unmarried", " Wife"), class = "factor"), V9 = structure(c(5L,
5L, 5L, 3L, 3L, 5L), .Label = c(" Amer-Indian-Eskimo", " Asian-Pac-Islander",
" Black", " Other", " White"), class = "factor"), V10 = structure(c(2L,
2L, 2L, 2L, 1L, 1L), .Label = c(" Female", " Male"), class = "factor"),
V11 = c(2174L, 0L, 0L, 0L, 0L, 0L), V12 = c(0L, 0L, 0L, 0L,
0L, 0L), V13 = c(40L, 13L, 40L, 40L, 40L, 40L), V14 = structure(c(40L,
40L, 40L, 40L, 6L, 40L), .Label = c(" ?", " Cambodia", " Canada",
" China", " Columbia", " Cuba", " Dominican-Republic", " Ecuador",
" El-Salvador", " England", " France", " Germany", " Greece",
" Guatemala", " Haiti", " Holand-Netherlands", " Honduras",
" Hong", " Hungary", " India", " Iran", " Ireland", " Italy",
" Jamaica", " Japan", " Laos", " Mexico", " Nicaragua", " Outlying-US(Guam-USVI-etc)",
" Peru", " Philippines", " Poland", " Portugal", " Puerto-Rico",
" Scotland", " South", " Taiwan", " Thailand", " Trinadad&Tobago",
" United-States", " Vietnam", " Yugoslavia"), class = "factor")), .Names = c("V1",
"V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11",
"V12", "V13", "V14"), row.names = c(NA, 6L), class = "data.frame")
trainY
structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c(" <=50K", " >50K"
), class = "factor")
After reading in trainX, trainY, the easiest way to reproduce this problem would be
library(C50)
# Fit a C5.0 model from the trainX / trainY objects read in beforehand.
test <- C5.0(x = trainX, y = trainY)
My actual train Y :
structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 2L, 1L, 2L, 2L, 1L, 1L), .Label = c(" <=50K", " >50K"), class = "factor")
My actual trainX
structure(list(age = c(39L, 50L, 38L, 53L, 28L, 37L, 49L, 52L,
31L, 42L, 37L, 30L, 23L, 32L, 40L, 34L, 25L, 32L, 38L, 43L, 40L,
54L, 35L, 43L, 59L, 56L, 19L, 54L, 39L, 49L, 23L, 20L, 45L, 30L,
22L, 48L, 21L, 19L, 31L, 48L, 31L, 53L, 24L, 49L, 25L, 57L, 53L,
44L, 41L, 29L, 25L, 18L, 47L, 50L, 47L, 43L, 46L, 35L, 41L, 30L,
30L, 32L, 48L, 42L, 29L, 36L, 28L, 53L, 49L, 25L, 19L, 31L, 29L,
23L, 79L, 27L, 40L, 67L, 18L, 31L, 18L, 52L, 46L, 59L, 44L, 53L,
49L, 33L, 30L, 43L, 57L, 37L, 28L, 30L, 34L, 29L, 48L, 37L, 48L,
32L), workClass = structure(c(8L, 7L, 5L, 5L, 5L, 5L, 5L, 7L,
5L, 5L, 5L, 8L, 5L, 5L, 5L, 5L, 7L, 5L, 5L, 7L, 5L, 5L, 2L, 5L,
5L, 3L, 5L, 1L, 5L, 5L, 3L, 5L, 5L, 2L, 8L, 5L, 5L, 5L, 5L, 7L,
5L, 7L, 5L, 5L, 5L, 2L, 5L, 5L, 8L, 5L, 5L, 5L, 5L, 2L, 6L, 5L,
5L, 5L, 5L, 5L, 5L, 1L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 1L, 5L, 5L,
7L, 5L, 5L, 5L, 5L, 1L, 5L, 3L, 5L, 5L, 5L, 5L, 5L, 5L, 3L, 5L,
5L, 2L, 5L, 5L, 5L, 5L, 3L, 3L, 7L, 5L, 5L, 2L), .Label = c(" ?",
" Federal-gov", " Local-gov", " Never-worked", " Private", " Self-emp-inc",
" Self-emp-not-inc", " State-gov", " Without-pay"), class = "factor"),
fnlwgt = c(77516L, 83311L, 215646L, 234721L, 338409L, 284582L,
160187L, 209642L, 45781L, 159449L, 280464L, 141297L, 122272L,
205019L, 121772L, 245487L, 176756L, 186824L, 28887L, 292175L,
193524L, 302146L, 76845L, 117037L, 109015L, 216851L, 168294L,
180211L, 367260L, 193366L, 190709L, 266015L, 386940L, 59951L,
311512L, 242406L, 197200L, 544091L, 84154L, 265477L, 507875L,
88506L, 172987L, 94638L, 289980L, 337895L, 144361L, 128354L,
101603L, 271466L, 32275L, 226956L, 51835L, 251585L, 109832L,
237993L, 216666L, 56352L, 147372L, 188146L, 59496L, 293936L,
149640L, 116632L, 105598L, 155537L, 183175L, 169846L, 191681L,
200681L, 101509L, 309974L, 162298L, 211678L, 124744L, 213921L,
32214L, 212759L, 309634L, 125927L, 446839L, 276515L, 51618L,
159937L, 343591L, 346253L, 268234L, 202051L, 54334L, 410867L,
249977L, 286730L, 212563L, 117747L, 226296L, 115585L, 191277L,
202683L, 171095L, 249409L), education = structure(c(10L,
10L, 12L, 2L, 10L, 13L, 7L, 12L, 13L, 10L, 16L, 10L, 10L,
8L, 9L, 6L, 12L, 12L, 2L, 13L, 11L, 12L, 7L, 2L, 12L, 10L,
12L, 16L, 12L, 12L, 8L, 16L, 10L, 16L, 16L, 2L, 16L, 12L,
16L, 8L, 7L, 10L, 10L, 12L, 12L, 10L, 12L, 13L, 9L, 9L, 16L,
12L, 15L, 10L, 12L, 16L, 5L, 9L, 12L, 12L, 10L, 6L, 12L,
11L, 16L, 12L, 16L, 12L, 16L, 16L, 16L, 10L, 10L, 16L, 16L,
12L, 8L, 1L, 2L, 6L, 12L, 10L, 12L, 12L, 12L, 12L, 12L, 13L,
7L, 11L, 9L, 16L, 16L, 12L, 10L, 16L, 11L, 16L, 8L, 12L), .Label = c(" 10th",
" 11th", " 12th", " 1st-4th", " 5th-6th", " 7th-8th", " 9th",
" Assoc-acdm", " Assoc-voc", " Bachelors", " Doctorate",
" HS-grad", " Masters", " Preschool", " Prof-school", " Some-college"
), class = "factor"), educationNum = c(13L, 13L, 9L, 7L,
13L, 14L, 5L, 9L, 14L, 13L, 10L, 13L, 13L, 12L, 11L, 4L,
9L, 9L, 7L, 14L, 16L, 9L, 5L, 7L, 9L, 13L, 9L, 10L, 9L, 9L,
12L, 10L, 13L, 10L, 10L, 7L, 10L, 9L, 10L, 12L, 5L, 13L,
13L, 9L, 9L, 13L, 9L, 14L, 11L, 11L, 10L, 9L, 15L, 13L, 9L,
10L, 3L, 11L, 9L, 9L, 13L, 4L, 9L, 16L, 10L, 9L, 10L, 9L,
10L, 10L, 10L, 13L, 13L, 10L, 10L, 9L, 12L, 6L, 7L, 4L, 9L,
13L, 9L, 9L, 9L, 9L, 9L, 14L, 5L, 16L, 11L, 10L, 10L, 9L,
13L, 10L, 16L, 10L, 12L, 9L), marital = structure(c(5L, 3L,
1L, 3L, 3L, 3L, 4L, 3L, 5L, 3L, 3L, 3L, 5L, 5L, 3L, 3L, 5L,
5L, 3L, 1L, 3L, 6L, 3L, 3L, 1L, 3L, 5L, 3L, 1L, 3L, 5L, 5L,
1L, 3L, 3L, 5L, 5L, 2L, 3L, 3L, 3L, 3L, 3L, 6L, 5L, 3L, 3L,
1L, 3L, 5L, 3L, 5L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 4L,
3L, 3L, 1L, 3L, 1L, 3L, 3L, 5L, 5L, 6L, 3L, 5L, 3L, 5L, 3L,
3L, 5L, 3L, 5L, 3L, 3L, 3L, 1L, 1L, 3L, 3L, 5L, 5L, 3L, 1L,
1L, 3L, 3L, 5L, 3L, 3L, 1L, 5L), .Label = c(" Divorced",
" Married-AF-spouse", " Married-civ-spouse", " Married-spouse-absent",
" Never-married", " Separated", " Widowed"), class = "factor"),
occ = structure(c(2L, 5L, 7L, 7L, 11L, 5L, 9L, 5L, 11L, 5L,
5L, 11L, 2L, 13L, 4L, 15L, 6L, 8L, 13L, 5L, 11L, 9L, 6L,
15L, 14L, 14L, 4L, 1L, 5L, 4L, 12L, 13L, 5L, 2L, 9L, 8L,
8L, 2L, 13L, 11L, 8L, 11L, 14L, 2L, 7L, 11L, 8L, 5L, 4L,
11L, 5L, 9L, 11L, 5L, 5L, 14L, 8L, 9L, 2L, 8L, 13L, 1L, 15L,
11L, 14L, 4L, 2L, 2L, 5L, 1L, 11L, 13L, 13L, 8L, 11L, 9L,
2L, 1L, 9L, 6L, 13L, 9L, 9L, 13L, 4L, 13L, 12L, 11L, 13L,
11L, 11L, 4L, 8L, 13L, 12L, 7L, 11L, 13L, 5L, 9L), .Label = c(" ?",
" Adm-clerical", " Armed-Forces", " Craft-repair", " Exec-managerial",
" Farming-fishing", " Handlers-cleaners", " Machine-op-inspct",
" Other-service", " Priv-house-serv", " Prof-specialty",
" Protective-serv", " Sales", " Tech-support", " Transport-moving"
), class = "factor"), relationship = structure(c(2L, 1L,
2L, 1L, 6L, 6L, 2L, 1L, 2L, 1L, 1L, 1L, 4L, 2L, 1L, 1L, 4L,
5L, 1L, 5L, 1L, 5L, 1L, 1L, 5L, 1L, 4L, 1L, 2L, 1L, 2L, 4L,
4L, 4L, 1L, 5L, 4L, 6L, 1L, 1L, 1L, 1L, 1L, 5L, 2L, 1L, 1L,
5L, 1L, 2L, 6L, 4L, 6L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 2L, 1L, 2L, 6L, 1L, 4L, 4L, 4L, 1L, 2L, 3L, 4L, 1L,
1L, 4L, 1L, 2L, 1L, 6L, 1L, 2L, 4L, 1L, 1L, 2L, 2L, 1L, 5L,
5L, 6L, 1L, 2L, 1L, 1L, 5L, 4L), .Label = c(" Husband", " Not-in-family",
" Other-relative", " Own-child", " Unmarried", " Wife"), class = "factor"),
race = structure(c(5L, 5L, 5L, 3L, 3L, 5L, 3L, 5L, 5L, 5L,
3L, 2L, 5L, 3L, 2L, 1L, 5L, 5L, 5L, 5L, 5L, 3L, 3L, 5L, 5L,
5L, 5L, 2L, 5L, 5L, 5L, 3L, 5L, 5L, 3L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 3L, 5L, 5L, 5L, 5L, 4L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 3L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 3L, 2L, 5L, 5L, 5L, 5L, 5L, 3L
), .Label = c(" Amer-Indian-Eskimo", " Asian-Pac-Islander",
" Black", " Other", " White"), class = "factor"), sex = structure(c(2L,
2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L), .Label = c(" Female",
" Male"), class = "factor"), capGain = c(2174L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 14084L, 5178L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 5013L, 2407L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 14344L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), capLoss = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 2042L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1408L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1902L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1573L, 0L, 0L, 1902L, 0L, 0L, 0L), hours = c(40L,
13L, 40L, 40L, 40L, 40L, 16L, 45L, 50L, 40L, 80L, 40L, 30L,
50L, 40L, 45L, 35L, 40L, 50L, 45L, 60L, 20L, 40L, 40L, 40L,
40L, 40L, 60L, 80L, 40L, 52L, 44L, 40L, 40L, 15L, 40L, 40L,
25L, 38L, 40L, 43L, 40L, 50L, 40L, 35L, 40L, 38L, 40L, 40L,
43L, 40L, 30L, 60L, 55L, 60L, 40L, 40L, 40L, 48L, 40L, 40L,
40L, 40L, 45L, 58L, 40L, 40L, 40L, 50L, 40L, 32L, 40L, 70L,
40L, 20L, 40L, 40L, 2L, 22L, 40L, 30L, 40L, 40L, 48L, 40L,
35L, 40L, 50L, 40L, 50L, 40L, 40L, 25L, 35L, 40L, 50L, 60L,
48L, 40L, 40L), country = structure(c(40L, 40L, 40L, 40L,
6L, 40L, 24L, 40L, 40L, 40L, 40L, 20L, 40L, 40L, 1L, 27L,
40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 36L,
40L, 40L, 40L, 40L, 40L, 40L, 40L, 34L, 40L, 40L, 1L, 40L,
40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 1L,
17L, 40L, 40L, 40L, 27L, 34L, 40L, 40L, 40L, 1L, 40L, 40L,
40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 27L,
40L, 40L, 40L, 40L, 40L, 6L, 40L, 40L, 40L, 40L, 40L, 40L,
40L, 40L, 40L, 40L, 40L, 1L, 40L, 40L, 40L, 40L, 10L, 40L
), .Label = c(" ?", " Cambodia", " Canada", " China", " Columbia",
" Cuba", " Dominican-Republic", " Ecuador", " El-Salvador",
" England", " France", " Germany", " Greece", " Guatemala",
" Haiti", " Holand-Netherlands", " Honduras", " Hong", " Hungary",
" India", " Iran", " Ireland", " Italy", " Jamaica", " Japan",
" Laos", " Mexico", " Nicaragua", " Outlying-US(Guam-USVI-etc)",
" Peru", " Philippines", " Poland", " Portugal", " Puerto-Rico",
" Scotland", " South", " Taiwan", " Thailand", " Trinadad&Tobago",
" United-States", " Vietnam", " Yugoslavia"), class = "factor")), .Names = c("age",
"workClass", "fnlwgt", "education", "educationNum", "marital",
"occ", "relationship", "race", "sex", "capGain", "capLoss", "hours",
"country"), row.names = c(NA, 100L), class = "data.frame")
The code you provided constructs a response factor in which only one class (<=50K) actually occurs, because the integer codes passed as the first argument are all 1L. You should assign the class codes accordingly or use an easier way to construct your response variable - something like trainY <- as.factor(...).
I changed the way trainY is constructed to:
# Six-observation response containing both classes (codes 1 and 2), so the
# tree has something to separate. The labels keep their leading space.
y <- factor(
  c(1L, 2L, 1L, 1L, 2L, 1L),
  levels = 1:2,
  labels = c(" <=50K", " >50K")
)
and after re-training the tree with the same commands I got:
Decision tree:
V14 = Cuba: >50K (1)
V14 in {?,Cambodia,Canada,China,Columbia,Dominican-Republic,Ecuador,
El-Salvador,England,France,Germany,Greece,Guatemala,Haiti,
Holand-Netherlands,Honduras,Hong,Hungary,India,Iran,Ireland,Italy,
Jamaica,Japan,Laos,Mexico,Nicaragua,Outlying-US(Guam-USVI-etc),Peru,
Philippines,Poland,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,
Trinadad&Tobago,United-States,Vietnam,Yugoslavia}: <=50K (5/1)
Make sure you don't have only one class in the response when passing args to C5.0. hth
UPDATE
After plotting some of the predictors against the response, I noticed that education and educationNum show the clearest division in the data (Doctorate implies >50K immediately). The next step was to tweak some of the very useful C5.0Control options - they are well documented in the C50 package documentation and on the official informal tutorial page. Check them out; they give you broad control over how the classifier is built.
For example:
# Fit with explicit control options. The logical flags are spelled TRUE rather
# than T: T is an ordinary variable that can be reassigned, which would
# silently change these settings.
C5.0(
  x = trainX,
  y = trainY,
  control = C5.0Control(
    subset = TRUE,
    winnow = TRUE,
    minCases = 4,
    fuzzyThreshold = TRUE
  )
)
Decision tree:
educationNum <= 13 (14.5): <=50K (95/20)
educationNum >= 16 (14.5): >50K (5)
Similarly, doing some "feature engineering", which in this case meant just leaving out some of the columns from the original data frame, produced:
# Refit on predictor columns 1-5 and 9-13 only; columns 6-8 and 14 of the
# 14-column trainX are left out.
C5.0(x = trainX[, c(1:5, 9:13)], y = trainY)
Decision tree:
educationNum <= 14: <=50K (95/20)
educationNum > 14: >50K (5)
I believe that there is no one general "out of the box" C5.0 default setting that would produce satisfying results for all kinds of problems, so it really comes down to trying out different parameter settings, features, etc. But, as with all things R, there is plenty of material around to give you some direction.
Related
Undesired error in length() when running a glmer() model in R
My dataset looks like this: > head(GLM_df) hour Feeding Foraging Standing ID Area Feeding_Foraging 1 0 0.119 0.789 0.0339 41361 Seronera 0.908 2 1 0.0920 0.819 0.0339 41361 Seronera 0.911 3 2 0.0847 0.824 0.0678 41361 Seronera 0.909 4 3 0.233 0.632 0.132 41361 Seronera 0.866 5 4 0.254 0.597 0.124 41361 Seronera 0.852 6 5 0.245 0.664 0.0832 41361 Seronera 0.909 And I'm trying to run a glmer() model as such to verify an interaction, the error associated is found below: > m <- glmer(cbind(Feeding_Foraging,Standing) ~ poly(hour,2)*Area+(1|ID) , data=GLM_df , family=binomial) Error in length(value <- as.numeric(value)) == 1L : (maxstephalfit) PIRLS step-halvings failed to reduce deviance in pwrssUpdate In addition: Warning message: In eval(family$initialize, rho) : non-integer counts in a binomial glm! I apologize if I'm not asking on the right forum, but does somebody know what is the cause of this error? I've been using this dataset to run other glmer() models not having such issue, so I hope somebody can help me. 
I can provide a dput() sample of the data below: > dput(GLM_df) structure(list(hour = c(0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L), Feeding = c(0.118579234700529, 0.0919594065024507, 0.0846994533575204, 0.233092895639896, 0.254098360072561, 0.244523639258233, 0.238513660654777, 0.245289616923379, 0.211748633393801, 0.253514225911475, 0.275555554923133, 0.222477230819087, 0.232641165221989, 0.238368461591879, 0.30265937999754, 0.433661201190504, 0.178745053292422, 0.12125395428024, 0.10605844594333, 0.163238946470857, 0.174611180767811, 0.22483854891269, 0.177868852050793, 0.183918813004901, 0.241998438164344, 0.161698956409812, 0.158105646267371, 0.36138433432542, 0.468670308578279, 0.333151183206247, 0.32072859671381, 0.301413227120555, 0.295571885509692, 0.313952640445209, 0.343315117609149, 0.309435336266141, 0.345573769698683, 0.307176684176607, 0.322987248803344, 0.303788706042306, 0.266520946564997, 0.179710144515087, 0.151781420416677, 0.272293057460473, 0.384777516681307, 0.358157688483229, 0.370418942683556, 0.295571885509692, 0.194038747691774, 0.0980730512560762, 0.104719324151116, 0.287394007254483, 0.360255008280653, 0.356867030146353, 0.303788706042306, 0.297908422154037, 0.295883423728938, 0.309435336266141, 0.335409835295781, 0.294754097684171, 0.329763205071946, 0.311693988355675, 0.252969034027794, 0.320554854245385, 0.269908924699298, 
0.114670029160951, 0.145400728263743, 0.208925318281884, 0.252065573191981, 0.343637782193368, 0.234552332374672, 0.25071038193826, 0.139938227286338, 0.127049180036281, 0.0779234970889187, 0.271038250744065, 0.37923497180722, 0.365027321566604, 0.313661201465914, 0.342076501947147, 0.292896174191167, 0.283060108639971, 0.271038250744065, 0.238251365573412, 0.196721311023918, 0.191256830162143, 0.16601092858074, 0.0626775954845651, 0.134426229199678, 0.105704917790185, 0.11195058182907, 0.140192198660723, 0.14806719253611, 0.21262483463543, 0.226733921295516, 0.21891551021636, 0.120612021581109, 0.140939890386914, 0.0931693986932724, 0.2142076497816, 0.228415300022216, 0.194244079699913, 0.181821493207477, 0.186922931547631, 0.153588342088304, 0.15187488188245, 0.135519125372033, 0.171657558804575, 0.144302772386887, 0.113322027250751, 0.0931693986932724, 0.0657666343717217, 0.126775955993192, 0.0912147959234835, 0.0966201171633936, 0.143219075677262, 0.127049180036281, 0.145683059774935, 0.171657558804575, 0.140731399424803, 0.238570126957016, 0.109339294334254, 0.14013909555517, 0.190856101565613, 0.175240248325904, 0.217486338298665, 0.251366119641673, 0.295081966535877, 0.278688523950551, 0.268852458399355, 0.349726775153633, 0.328961747878886, 0.351912567498343, 0.284153004812326, 0.220218578729553, 0.179437360446302, 0.283460837236502, 0.156693988711413, 0.114187411193102, 0.207187893597627, 0.198761383878981, 0.22134790477432, 0.199890709923748, 0.218466176246294), Foraging = c(0.78939890529209, 0.81876138245603, 0.824408012679865, 0.632422585069486, 0.59741347768171, 0.66404371432296, 0.599672129771244, 0.632422585069486, 0.629034606935185, 0.575956282831139, 0.525136610816626, 0.588378869323575, 0.577085608875906, 0.574826956786372, 0.482222221115483, 0.336377829048438, 0.677595626860163, 0.811985426187429, 0.797304187605459, 0.744225863501412, 0.727285972829908, 0.702440799845036, 0.721639342606074, 0.744225863501412, 0.593480307663729, 0.692276865442133, 
0.705828777979336, 0.29136611954987, 0.178520386307389, 0.320647930567756, 0.343470886718772, 0.422913132626516, 0.393706424572198, 0.350480496651808, 0.350091073877751, 0.339966081752254, 0.289107467460336, 0.294403617187519, 0.226644054501503, 0.185602280400827, 0.465282330443979, 0.671948996636328, 0.677595626860163, 0.525136610816626, 0.359125682235886, 0.398652093802729, 0.407725644438271, 0.496903459697453, 0.519489980592792, 0.647103823651456, 0.618870672532282, 0.247583017506598, 0.159987856341983, 0.170810564270999, 0.290898812221001, 0.315807961804469, 0.2952380945605, 0.274543055710583, 0.21405861848537, 0.274947456283643, 0.241067674940635, 0.254098360072561, 0.192437158028286, 0.1589743586095, 0.334732239668921, 0.591766847457876, 0.587638966052866, 0.500018841889913, 0.436807180886641, 0.401884302827407, 0.44922080447396, 0.438017173077463, 0.748633878063245, 0.820765025438681, 0.896174861331183, 0.336612021085371, 0.116546447819948, 0.204633879311769, 0.282720933965792, 0.313952640445209, 0.293235348865346, 0.217959926640019, 0.244687309699503, 0.267759562227, 0.256357012162095, 0.20666666619235, 0.110109289364776, 0.0532396563961557, 0.284590163281268, 0.810928959887485, 0.790163932612739, 0.619999998577049, 0.523384208333367, 0.47682655223493, 0.493009231956877, 0.637874503906291, 0.632422585069486, 0.726775954616143, 0.817486336921616, 0.340983605774792, 0.142779078516963, 0.193598750531475, 0.256357012162095, 0.254682494233647, 0.206783493024567, 0.19198542761038, 0.221428570920375, 0.213793102957603, 0.203278688058049, 0.194157208465701, 0.112932604476694, 0.0948633877604228, 0.380582877086458, 0.787978140268028, 0.810928959887485, 0.719125681409657, 0.625136610587118, 0.562404370293935, 0.366120217738959, 0.535519124454, 0.655009105964824, 0.782513659406253, 0.757377047442085, 0.18996877395901, 0.158105646267371, 0.182574377237322, 0.24367381196702, 0.248087431124608, 0.269869982421893, 0.283586317908142, 0.23846153791425, 0.29272131080359, 
0.220218578729553, 0.13834244048395, 0.101639344029024, 0.0846994533575204, 0.23846153791425, 0.745355189546179, 0.686338796239004, 0.605318759995079, 0.500936767000192, 0.414375787195254, 0.393442622047837, 0.509364988467295), Standing = c(0.0338797813430082, 0.0338797813430082, 0.0677595626860163, 0.131754705222809, 0.124225864924363, 0.0831594632964746, 0.162622950446439, 0.101639344029024, 0.112932604476694, 0.0931693986932724, 0.0975737702678635, 0.101639344029024, 0.12046144477514, 0.128743169103431, 0.137059115433078, 0.14761904728025, 0.0677595626860163, 0.0338797813430082, 0.0338797813430082, 0.0639951425367932, 0.0423497266787602, 0.0677595626860163, 0.107285974252859, 0.054207650148813, 0.0790528231336857, 0.0609836064174147, 0.0451730417906775, 0.195749847759603, 0.229629629102611, 0.225865208953388, 0.198259461192418, 0.160928961379289, 0.183201780595526, 0.203278688058049, 0.149321999252517, 0.198605614769358, 0.212958625584623, 0.281462798849606, 0.306128024277895, 0.398379497860889, 0.111677797760286, 0.0677595626860163, 0.0547288775540901, 0.0931693986932724, 0.145830363172079, 0.153350589236774, 0.105403764178248, 0.149071037909236, 0.152459016043537, 0.135519125372033, 0.119882303213721, 0.254098360072561, 0.296740153831865, 0.255227686117328, 0.178182553729895, 0.206102003169966, 0.186338797386545, 0.175045536938875, 0.264028640811029, 0.235903662684649, 0.235855400887864, 0.189259468191977, 0.333151183206247, 0.403169397981797, 0.203278688058049, 0.0884638735067435, 0.116461748366591, 0.127819175066803, 0.183918813004901, 0.155538996165628, 0.179710144515087, 0.15951730382333, 0.190573770054421, 0.167140254625507, 0.11067395238716, 0.392349725875482, 0.526775955075159, 0.469945354112694, 0.421857922529069, 0.365901638504488, 0.43278688425262, 0.506010927800412, 0.515846993351608, 0.493989069904506, 0.555191255556392, 0.608743168001792, 0.768306009165636, 0.947540981431873, 0.590163933071755, 0.169398906715041, 0.163752276491206, 
0.297658078942143, 0.42228727459678, 0.412398717726961, 0.432306009936784, 0.283743168747693, 0.300400727908006, 0.183201780595526, 0.132573057429162, 0.444808742148526, 0.6426229493448, 0.637158468483024, 0.575956282831139, 0.58688524455469, 0.657923495757771, 0.690710380928424, 0.664480872791902, 0.633879779965959, 0.690710380928424, 0.731147539305563, 0.828415298645167, 0.933333331191257, 0.504918031628057, 0.161580495635885, 0.141411261257773, 0.231511839177222, 0.389617485444594, 0.325245900892878, 0.467759561767984, 0.370341058128744, 0.244523639258233, 0.255094824229708, 0.184927139830586, 0.643715845517155, 0.774863386199767, 0.676502730687808, 0.544262293832841, 0.456830600044432, 0.468852457940339, 0.48415300435331, 0.450273223010302, 0.43497267659733, 0.449180326837947, 0.608743168001792, 0.724590162271432, 0.816393440749261, 0.525683058902804, 0.196825396373666, 0.2766848809679, 0.298142075818472, 0.393247462017059, 0.468475597191251, 0.426885244921903, 0.380496005852245), ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L), .Label = c("41361", "41365", "41366", "41366bis", "41367", "41368"), class = "factor"), Area = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Loliondo", "Seronera"), class = "factor"), Feeding_Foraging = c(0.907978139992619, 0.910720788958481, 0.909107466037385, 0.865515480709382, 0.851511837754272, 0.908567353581193, 0.838185790426022, 0.877712201992865, 0.840783240328986, 0.829470508742613, 0.800692165739759, 0.810856100142662, 0.809726774097895, 0.81319541837825, 0.784881601113022, 0.770039030238942, 0.856340680152585, 0.933239380467668, 0.903362633548788, 0.90746480997227, 0.901897153597719, 0.927279348757726, 0.899508194656866, 0.928144676506314, 0.835478745828073, 0.853975821851945, 0.863934424246708, 0.65275045387529, 0.647190694885669, 0.653799113774003, 0.664199483432583, 0.724326359747071, 0.68927831008189, 0.664433137097017, 0.6934061914869, 0.649401418018395, 0.634681237159019, 0.601580301364126, 0.549631303304847, 0.489390986443134, 0.731803277008976, 0.851659141151415, 0.82937704727684, 0.797429668277099, 0.743903198917193, 0.756809782285958, 0.778144587121826, 0.792475345207145, 0.713528728284566, 0.745176874907532, 0.723589996683398, 0.534977024761081, 0.520242864622636, 0.527677594417352, 0.594687518263307, 0.613716383958506, 0.591121518289437, 0.583978391976724, 0.54946845378115, 0.569701553967814, 0.570830880012581, 0.565792348428236, 0.44540619205608, 0.479529212854885, 0.604641164368219, 0.706436876618826, 0.733039694316609, 0.708944160171797, 0.688872754078621, 0.745522085020775, 0.683773136848632, 0.688727555015723, 0.888572105349583, 0.947814205474962, 0.974098358420102, 0.607650271829437, 0.495781419627168, 0.569661200878373, 0.596382135431706, 0.656029142392356, 0.586131523056514, 0.501020035279991, 0.515725560443569, 0.506010927800412, 0.453078323186013, 0.397923496354493, 
0.276120217945516, 0.115917251880721, 0.419016392480946, 0.916633877677671, 0.902114514441809, 0.760192197237773, 0.671451400869477, 0.68945138687036, 0.719743153252393, 0.856790014122652, 0.753034606650595, 0.867715845003057, 0.910655735614888, 0.555191255556392, 0.371194378539179, 0.387842830231389, 0.438178505369572, 0.441605425781279, 0.360371835112871, 0.34386030949283, 0.356947696292407, 0.385450661762178, 0.347581460444935, 0.307479235716452, 0.206102003169966, 0.160630022132145, 0.50735883307965, 0.879192936191512, 0.907549077050879, 0.862344757086919, 0.752185790623399, 0.70808743006887, 0.537777776543534, 0.676250523878803, 0.89357923292184, 0.891852953740506, 0.897516142997256, 0.380824875524623, 0.333345894593276, 0.400060715535987, 0.495039931608694, 0.543169397660485, 0.548558506372443, 0.552438776307497, 0.588188313067882, 0.621683058682476, 0.572131146227896, 0.422495445296276, 0.321857922758577, 0.264136813803823, 0.521922375150751, 0.902049178257592, 0.800526207432105, 0.812506653592706, 0.699698150879173, 0.635723691969573, 0.593333331971585, 0.727831164713589)), row.names = c(NA, -144L), vars = "hour", indices = list(c(0L, 24L, 48L, 72L, 96L, 120L), c(1L, 25L, 49L, 73L, 97L, 121L), c(2L, 26L, 50L, 74L, 98L, 122L), c(3L, 27L, 51L, 75L, 99L, 123L), c(4L, 28L, 52L, 76L, 100L, 124L), c(5L, 29L, 53L, 77L, 101L, 125L), c(6L, 30L, 54L, 78L, 102L, 126L), c(7L, 31L, 55L, 79L, 103L, 127L), c(8L, 32L, 56L, 80L, 104L, 128L), c(9L, 33L, 57L, 81L, 105L, 129L), c(10L, 34L, 58L, 82L, 106L, 130L), c(11L, 35L, 59L, 83L, 107L, 131L), c(12L, 36L, 60L, 84L, 108L, 132L), c(13L, 37L, 61L, 85L, 109L, 133L), c(14L, 38L, 62L, 86L, 110L, 134L), c(15L, 39L, 63L, 87L, 111L, 135L), c(16L, 40L, 64L, 88L, 112L, 136L), c(17L, 41L, 65L, 89L, 113L, 137L), c(18L, 42L, 66L, 90L, 114L, 138L), c(19L, 43L, 67L, 91L, 115L, 139L), c(20L, 44L, 68L, 92L, 116L, 140L), c(21L, 45L, 69L, 93L, 117L, 141L), c(22L, 46L, 70L, 94L, 118L, 142L), c(23L, 47L, 71L, 95L, 119L, 143L)), group_sizes = 
c(6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L), biggest_group_size = 6L, labels = structure(list( hour = 0:23), row.names = c(NA, -24L), class = "data.frame", vars = "hour"), class = c("grouped_df", "tbl_df", "tbl", "data.frame")) Any input is appreciated!
Specific data in secondary y axis
This language is still a bit alien to me. I want to make a complicate graph with two axis and data plotted by groups. The nature of my data STAT. I will write it as code, otherwise I cannot manage to publish the post: 4 time points ("0", "3", "5" and "7"), column Day. Data divided in 5 groups, column SNu ("1", "2", "3", "4", "5") or SNa (the actual name of each group). There are 4 values per group and time point, column Rep. Graph could plot the mean of these four values. Data1 based on the area between the actual measures of one day and the following day, column SAr (some values are 0, between 0 and 205, some of them with decimals). I want to plot this in the primary y axis. Data2, column DW (values between 0 and 1, all of them with 4 decimals). I want to plot this in the secondary axis. I show below some modified data as an example. structure(list(Sname = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("H4.8", "S302", "S309", "S313", "T.m"), class = "factor"), Snumber = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), Day = c(0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L), Replica = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L), Diff = c(0L, 0L, 160L, 200L, 0L, 10L, 140L, 160L, 0L, 0L, 50L, 170L, 0L, 10L, 70L, 150L, 0L, 10L, 210L, 140L, 0L, 0L, 0L, 120L, 0L, 30L, 70L, 160L, 0L, 20L, 110L, 140L, 0L, 30L, 190L, 150L, 0L, 10L, 80L, 130L, 0L, 10L, 90L, 140L, 0L, 0L, 170L, 170L, 0L, 80L, 200L, 410L, 0L, 10L, 150L, 0L, 90L, 200L, 390L, 0L, 50L, 220L, 600L, 0L, 0L, 0L, 100L, 0L, 0L, 0L, 70L, 0L, 20L, 10L, 150L, 0L, 20L, 40L, 140L), Sum = c(0L, 0L, 160L, 360L, 0L, 10L, 150L, 310L, 0L, 0L, 50L, 220L, 0L, 10L, 80L, 230L, 0L, 10L, 220L, 360L, 0L, 0L, 0L, 120L, 0L, 30L, 100L, 260L, 0L, 20L, 130L, 270L, 0L, 30L, 220L, 370L, 0L, 10L, 90L, 220L, 0L, 10L, 100L, 240L, 0L, 0L, 170L, 340L, 0L, 80L, 280L, 690L, 0L, 10L, 160L, 0L, 90L, 290L, 680L, 0L, 50L, 270L, 870L, 0L, 0L, 0L, 100L, 0L, 0L, 0L, 70L, 0L, 20L, 30L, 180L, 0L, 20L, 60L, 200L), Sumarea = structure(c(1L, 1L, 17L, 33L, 1L, 2L, 16L, 29L, 1L, 1L, 3L, 22L, 1L, 2L, 9L, 22L, 1L, 2L, 22L, 32L, 1L, 1L, 1L, 14L, 1L, 20L, 12L, 23L, 1L, 13L, 15L, 24L, 1L, 20L, 22L, 31L, 1L, 2L, 11L, 21L, 1L, 2L, 12L, 23L, 1L, 1L, 18L, 31L, 1L, 4L, 27L, 7L, 1L, 2L, 17L, 1L, 6L, 28L, 5L, 1L, 30L, 25L, 10L, 1L, 1L, 1L, 12L, 1L, 1L, 1L, 8L, 1L, 13L, 26L, 17L, 1L, 13L, 6L, 19L), .Label = c("0", "1,6", "12,5", "13,3", "147,5", "15", "152,5", "17,5", "20", "205", "22,5", "25", "3,3", "30", "32,5", "37,5", "40", "42,5", "45", "5", "52,5", "55", "57,5", "62,5", "67,5", "7,5", "70", "72,5", "75", "8,3", "85", "87,5", "90"), class = "factor"), Sumarea10 = c(0L, 0L, 400L, 900L, 0L, 16L, 375L, 750L, 0L, 0L, 125L, 550L, 0L, 16L, 200L, 550L, 0L, 16L, 550L, 875L, 0L, 0L, 0L, 300L, 0L, 50L, 250L, 575L, 0L, 33L, 325L, 625L, 0L, 50L, 550L, 850L, 0L, 16L, 225L, 525L, 0L, 16L, 250L, 575L, 0L, 0L, 425L, 850L, 0L, 133L, 700L, 1525L, 0L, 16L, 400L, 
0L, 150L, 725L, 1475L, 0L, 83L, 675L, 2050L, 0L, 0L, 0L, 250L, 0L, 0L, 0L, 175L, 0L, 33L, 75L, 400L, 0L, 33L, 150L, 450L), Dweight = structure(c(1L, 6L, 34L, 38L, 1L, 7L, 32L, 45L, 1L, 8L, 31L, 48L, 1L, 9L, 30L, 44L, 1L, 11L, 37L, 50L, 1L, 11L, 33L, 49L, 1L, 13L, 35L, 51L, 1L, 18L, 36L, 52L, 1L, 21L, 47L, 53L, 1L, 19L, 43L, 54L, 1L, 20L, 46L, 56L, 1L, 22L, 42L, 55L, 1L, 17L, 28L, 39L, 1L, 15L, 27L, 1L, 13L, 26L, 41L, 1L, 17L, 29L, 40L, 1L, 5L, 10L, 24L, 1L, 3L, 14L, 24L, 1L, 4L, 16L, 23L, 1L, 2L, 12L, 25L), .Label = c("0", "0,0003", "0,0006", "0,0007", "0,0008", "0,0011", "0,0017", "0,0026", "0,0033", "0,004", "0,0045", "0,0048", "0,005", "0,0051", "0,0053", "0,0055", "0,0056", "0,006", "0,007", "0,0074", "0,0082", "0,0086", "0,0142", "0,0204", "0,0222", "0,0333", "0,0342", "0,0345", "0,038", "0,0423", "0,0426", "0,0637", "0,0668", "0,0679", "0,0736", "0,0808", "0,0922", "0,0952", "0,0986", "0,0989", "0,0996", "0,1078", "0,1215", "0,1242", "0,1349", "0,1483", "0,1512", "0,1576", "0,1682", "0,1731", "0,1949", "0,2099", "0,262", "0,2676", "0,2742", "0,2808"), class = "factor"), Wweight = structure(c(1L, 3L, 40L, 42L, 1L, 4L, 37L, 44L, 1L, 8L, 26L, 48L, 1L, 9L, 24L, 43L, 1L, 10L, 41L, 49L, 1L, 11L, 39L, 46L, 1L, 12L, 35L, 50L, 1L, 14L, 38L, 53L, 1L, 22L, 52L, 57L, 1L, 20L, 47L, 58L, 1L, 17L, 51L, 60L, 1L, 21L, 45L, 59L, 1L, 15L, 34L, 54L, 1L, 19L, 32L, 1L, 16L, 31L, 56L, 1L, 18L, 36L, 55L, 1L, 7L, 13L, 27L, 1L, 6L, 29L, 25L, 1L, 5L, 30L, 23L, 1L, 2L, 33L, 28L), .Label = c("0", "0,0089", "0,0105", "0,0136", "0,0144", "0,0147", "0,0152", "0,0201", "0,0265", "0,0339", "0,0345", "0,0371", "0,045", "0,0463", "0,0569", "0,0583", "0,0587", "0,0596", "0,0602", "0,0649", "0,069", "0,0834", "0,1264", "0,1829", "0,1897", "0,1909", "0,1974", "0,2309", "0,3", "0,344", "0,3491", "0,3547", "0,364", "0,3729", "0,3756", "0,3932", "0,4357", "0,4361", "0,451", "0,4634", "0,479", "0,5109", "0,6594", "0,7182", "0,7423", "0,7865", "0,7938", "0,8406", "0,8407", "0,9152", "0,9347", "0,9675", 
"1", "1,0908", "1,1366", "1,1465", "1,6905", "1,7799", "1,8875", "1,9493"), class = "factor")), class = "data.frame", row.names = c(NA, -79L)) #Pretreat dataframe by creating factors for every column. STAT<- read.table("Biomass.txt", header=TRUE, fill=TRUE) SNa <- as.factor(STAT$Sname) SNu <- as.factor(STAT$Snumber) Day <- as.numeric(STAT$Day) Rep <- as.numeric(STAT$Replica) Dif <- as.numeric(STAT$Diff) Sum <- as.numeric(STAT$Sum) SAr10 <- as.numeric(STAT$Sumarea10) SAr <- c(SAr10/10) DW <- as.numeric(STAT$Dweight) WW <- as.numeric(STAT$Wweight) #I first tried to plot Dataone (`SAr`) as follows: points1 <- geom_point(aes(colour = SNa), size =.8) lines1 <- geom_smooth(method = loess, aes(colour = SNa), size =.5, se=TRUE, alpha=.2) text1 <- labs(title=expression (Biomass~and~CO[2]~production~summed~ area), x=expression(Time~" "~(days)), y=expression(CO[2]~production~sum~" "~(ppm))) g <- ggplot(data=STAT, aes(x=Day, y=SAr, group=SNa, fill=SNa, colour=SNa), par(mar=Marg)) g <- g + points1 + lines1 + text1 This is the result: So far so good, but here start the problems. 1. SHADE I would like to shade the area below the graphs. I have tried: area1 <- geom_ribbon(data = STAT[STAT$Snumber == '1',], aes(ymin = 0, ymax = predict(loess(Day ~ Sumarea))), alpha = 0.3, fill = "#114477") g <- g + points1 + lines1 + text1 + area1 plot(g) returns: Error in loess(Day ~ Sumarea) : predictors must all be numeric I have tried to put the numeric factors I created at the beginning, but Day and SAr do not have the same length Error in model.frame.default(formula = Day ~ SAr) : variable lengths differ (found for 'SAr'). I have also tried to make this with a density function and a geom_area but none of them resulted in what I wanted. 2. PLOT DATA2 I want the Datatwo (DW) attachted to the secondary y axis. 
#Secondary y axis y2 <- scale_y_continuous(sec.axis = sec_axis(~./150, name = "Dry weight")) #Grouped bars per time point bars2 <- geom_bar(aes(factor(Day), DW), stat="identity", position = "dodge") g <- g + points1 + lines1 + text1 + y2 + bars2 plot(g) returns: Error: Discrete value supplied to continuous scale I know that there cannot be a continuous scale on a variable of the factor type (Plotting with ggplot2: "Error: Discrete value supplied to continuous scale" on categorical y-axis). But their solution does not work for me either. ggplot(STAT[STAT$SNu == 1,], aes(x = STAT$Day, y = STAT$DW)) + scale_x_continuous(limits=c(0,7)) + scale_y_continuous(limits=c(0,1)) returns: Error: Aesthetics must be either length 1 or the same as the data (79): x, y If anyone can help me with these two issues, it would be greatly appreciated. As I am new to this language, I also encourage you to ask me about any specific details that might be relevant and that I did not include in the post. Also, any improvement to my code, even if unrelated to my questions, would be very welcome.
Use rbind() in nested for loop with apply() in r
How can you use rbind in a for loop that runs through a list of dataframes? I tried to follow Looping through list of data frames in R but receive the following: Error in apply(dataFramesList, 2, function(x) { : dim(X) must have a positive length I have two dataframes, dfTraining and dfAccuracy (code to reproduce dataframes is below), and need to add a row for any of the crop types missing from either of two columns, CROP or CROP_LABEL. I believe my problem is in my last line of code. My code block is: dataFramesList <- list(dfTraining, dfAccuracy) apply(dataFramesList, 2, function(x){ cropNumbers <- seq(1,23, by = 1) cropNumbers <- cropNumbers[-c(3)] cropNumbers <- append(cropNumbers, 34) listofCROPandCROP_LABELColumns <- list(dataFrameList$CROP, dataFrameList$CROP_LABEL) missingCROP <- NULL for (i in listofCROPandCROP_LABELColumns){ for (j in cropNumbers){ if (!j %in% i){ # If crop number is missing from CROP_LABEL, add missingCROP observation (row) # Make row for missing crop type missingCrop <- list(FREQUENCY = 0, AA = 1, CROP = j, CROP_LABEL = j, ACRES = 0) dataFrameList <- rbind(dataFrameList, missingCrop) } } } }) My dfAccuracy dataframe: structure(list(FREQUENCY = c(4L, 2L, 1L, 1L, 1L, 1L, 65L, 1L, 1L, 4L, 1L, 5L, 5L, 2L, 4L, 1L, 1L, 1L, 1L, 4L, 9L, 2L, 1L, 1L, 1L, 2L, 4L, 1L, 2L, 18L, 1L, 10L, 3L, 1L, 7L, 1L, 1L, 1L, 3L, 1L, 7L, 1L), AA = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), CROP = c(1L, 4L, 12L, 13L, 14L, 18L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 18L, 18L, 18L, 18L, 18L, 19L, 19L, 21L, 21L, 21L, 21L), CROP_LABEL = c(1L, 4L, 14L, 13L, 12L, 18L, 1L, 4L, 5L, 6L, 18L, 1L, 4L, 6L, 14L, 18L, 12L, 14L, 18L, 1L, 6L, 14L, 18L, 18L, 4L, 6L, 13L, 21L, 12L, 14L, 18L, 1L, 6L, 14L, 18L, 21L, 1L, 19L, 6L, 13L, 21L, 34L), ACRES = c(331.737184484, 193.772138572, 
26.48543619, 73.2696289437, 112.470306056, 66.6556450342, 3905.71121736, 24.9581079934, 39.9287379709, 259.662359273, 85.2786247851, 306.051491303, 368.342995232, 154.82030835, 265.754349805, 70.3722566979, 35.4066607701, 139.336463432, 58.4307705147, 251.070357093, 471.031628349, 150.965736858, 28.2780117926, 35.3426930108, 34.5730542194, 67.7383953308, 144.442123948, 33.2746560126, 69.4072817311, 1219.65459596, 92.4840910734, 582.983473317, 191.957841327, 35.708775262, 319.638682538, 60.6889287642, 82.6244195055, 36.2898952104, 267.422844756, 72.8352758659, 489.746546145, 65.5392893502)), row.names = c(25L, 26L, 27L, 29L, 30L, 31L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L, 85L, 86L, 87L, 88L, 89L, 90L, 91L, 92L, 93L, 94L, 95L ), class = "data.frame") and my dfTraining dataframe is: structure(list(FREQUENCY = c(7L, 1L, 1L, 4L, 2L, 6L, 1L, 107L, 1L, 21L, 1L, 1L, 1L, 2L, 1L, 19L, 3L, 1L, 1L, 12L, 1L, 2L, 32L, 2L, 2L, 29L, 2L, 18L, 1L), AA = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), CROP = c(1L, 1L, 4L, 4L, 12L, 13L, 21L, 1L, 1L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 7L, 12L, 13L, 14L, 14L, 14L, 18L, 18L, 18L, 19L, 21L, 34L), CROP_LABEL = c(1L, 4L, 1L, 4L, 12L, 13L, 21L, 1L, 6L, 4L, 6L, 1L, 5L, 14L, 18L, 6L, 14L, 1L, 12L, 13L, 1L, 6L, 14L, 6L, 14L, 18L, 19L, 21L, 34L), ACRES = c(624.940370218, 26.9188766351, 37.8773839813, 291.79294767, 140.949264214, 391.571023675, 44.5217011939, 6806.02216989, 72.7500299887, 1676.12121152, 14.8739557721, 67.0700291739, 59.7438207953, 82.6713019474, 75.62666152, 1370.78710769, 145.215281276, 41.7380537313, 66.5236760194, 679.91208779, 70.9661875374, 38.8514254734, 1749.63365551, 109.917242057, 79.7758083723, 1660.85759895, 96.8771921798, 1428.71888481, 69.473161379)), row.names = c(18L, 19L, 20L, 21L, 22L, 23L, 24L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 
51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L), class = "data.frame")
Re-assemble dataframe by index
What I'm trying to do is getting a dataframe where the repeated rows in the first column act as an index to copy the corresponding rows of other columns. I know this sound messy, and my inability to accurately state the issue is one of the reasons I'm having so many problems with this. I'll provide a reproducible example below. structure(list(Var1 = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L), .Label = c("2016-01", "2016-02", "2016-03", "2016-04", "2016-05", "2016-06", "2016-07", "2016-08", "2016-09", "2016-10", "2016-11", "2016-12", "2017-01", "2017-02", "2017-03", "2017-04", "2017-05"), class = "factor"), Var2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L ), .Label = c("B2B", "B2C", "B2K"), class = "factor"), Freq = c(5L, 13L, 8L, 13L, 36L, 5L, 18L, 1L, 12L, 24L, 22L, 6L, 24L, 15L, 11L, 26L, 1L, 338L, 285L, 291L, 232L, 142L, 42L, 92L, 9L, 46L, 34L, 45L, 35L, 30L, 31L, 36L, 56L, 9L, 0L, 1L, 0L, 0L, 0L, 0L, 7L, 0L, 13L, 0L, 1L, 0L, 0L, 0L, 0L, 2L, 0L)), .Names = c("Var1", "Var2", "Freq"), class = "data.frame", row.names = c(NA, -51L )) basically what I want is: On Var1 no repeated dates On the row where the date is repeated, take the value of Var2 and Freq and copy them in two new columns to the index of the unique date This must be done for every distinct level of Var2 Thank you in advance!
I think what you're trying to describe is a dcast. Does this give you the result you want? library(reshape2) dcast(x,Var1~Var2,value.var="Freq")
A base R option would be xtabs(Freq~Var1 + Var2, df1)
Taking the mean of a group of data that is dependent on multiple other columns in the same row in R
I want to take the mean of animal abundance every 4 quadrats. The station # and the areaContro # should match for averaged groups of quadrats Fairly new to R My attempt: aaply(commData, station ~ areaContro & quadrat ~ station, .fun = mean, .expand = TRUE,.inform = TRUE, .drop = TRUE) The error: Error in splitter_a(.data, .margins, .expand) : 'pairlist' object cannot be coerced to type 'integer' structure(list(areaContro = c(29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L), station = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L), quadrat = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L), latitude = c(42.12521667, 42.12658333, 42.12681667, 42.12705, 42.12466667, 42.12631667, 42.12671667, 42.1272, 42.12671667, 42.12682833, 42.12726166, 42.12794499, 42.12771667, 42.1285, 42.12871667, 42.12896667, 42.12691667, 42.12748333, 42.12763333, 42.12785, 42.127, 42.12711818, 42.12735152, 42.12755152, 42.1264341, 42.1265095, 42.12664427, 42.12679211, 42.12703333, 42.12725), longitude = c(-67.33001667, -67.32823333, -67.3281, -67.3279, -67.31041667, -67.30906667, -67.30876667, -67.30843333, -67.29326667, -67.2942027, -67.29311937, -67.2929027, -67.27731667, -67.2768, -67.27655, -67.27628333, -67.25879572, -67.25684572, -67.25647905, -67.25616238, -67.2359, -67.23562265, -67.23512265, -67.23472265, -67.21841245, -67.21825004, -67.21814781, -67.21796007, -67.19853333, -67.19653333), scallops = c(1L, 0L, 0L, 0L, 4L, 0L, 7L, 3L, 3L, 3L, 1L, 2L, 2L, 1L, 2L, 0L, 2L, 2L, 2L, 2L, 45L, 11L, 4L, 8L, 12L, 9L, 11L, 11L, 4L, 10L), clappers = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 0L)), .Names = c("areaContro", "station", "quadrat", "latitude", "longitude", "scallops", 
"clappers" ), row.names = c(NA, 30L), class = "data.frame")
If you are new to R, I strongly recommend taking a look at the tidyverse, in particular dplyr, for common data manipulation tasks. Your second argument to aaply is incorrect. According to the documentation, it accepts a vector giving the subscripts to split the data by (e.g. 1 for rows). Also note that it accepts an array and returns an array. I'm confused about which variable(s) you want to average over and what the average should be conditioned on. I think you want the average grouped by station and quadrat (and areaContro, but this is constant). Base R: tapply(data$scallops, data[c("station", "quadrat")], mean) dplyr: data %>% group_by(station, quadrat) %>% summarise(scallops_mean = mean(scallops))
I think that what you're trying to do could be accomplished simply like so: If you have: commData <- structure(list(areaContro = c(29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L), station = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L), quadrat = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L), latitude = c(42.12521667, 42.12658333, 42.12681667, 42.12705, 42.12466667, 42.12631667, 42.12671667, 42.1272, 42.12671667, 42.12682833, 42.12726166, 42.12794499, 42.12771667, 42.1285, 42.12871667, 42.12896667, 42.12691667, 42.12748333, 42.12763333, 42.12785, 42.127, 42.12711818, 42.12735152, 42.12755152, 42.1264341, 42.1265095, 42.12664427, 42.12679211, 42.12703333, 42.12725), longitude = c(-67.33001667, -67.32823333, -67.3281, -67.3279, -67.31041667, -67.30906667, -67.30876667, -67.30843333, -67.29326667, -67.2942027, -67.29311937, -67.2929027, -67.27731667, -67.2768, -67.27655, -67.27628333, -67.25879572, -67.25684572, -67.25647905, -67.25616238, -67.2359, -67.23562265, -67.23512265, -67.23472265, -67.21841245, -67.21825004, -67.21814781, -67.21796007, -67.19853333, -67.19653333), scallops = c(1L, 0L, 0L, 0L, 4L, 0L, 7L, 3L, 3L, 3L, 1L, 2L, 2L, 1L, 2L, 0L, 2L, 2L, 2L, 2L, 45L, 11L, 4L, 8L, 12L, 9L, 11L, 11L, 4L, 10L), clappers = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 0L)), .Names = c("areaContro", "station", "quadrat", "latitude", "longitude", "scallops", "clappers" ), row.names = c(NA, 30L), class = "data.frame") Check out ?aggregate: For scallops and only dependent on quadrats - just to show you how the function works: scallop <- aggregate(commData$scallops, by = list(commData$quadrat), FUN = mean) For all the requested variables: full_scallop <- 
aggregate(commData$scallops, by = list(commData$quadrat, commData$areaContro, commData$station), FUN = mean) Everything all together could look something like this: aggregate(cbind(commData$scallops, commData$clappers)~commData$quadrat+commData$areaContro+commData$station, FUN = mean)