Calculating column value using a glm - r

I have a dataset with about 20,000 lines of data. The data are grouped by year and area. I'm trying to calculate the length at which 50% of shrimp are female. When I run the following code, it just returns the value for the entire dataset, not unique values for each year/area pair.
The dataset is named shrimp.
library(MASS)
library(plyr)
Data
shrimp <- structure(list(cruise = c(1972L, 1972L, 1972L, 1972L, 1972L,
1972L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L, 2003L,
2003L, 2003L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L, 1985L,
1985L, 1985L, 1985L, 1985L, 1985L), areaname = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Balboa",
"Chiniak", "Pavlof"), class = "factor"), size = c(9, 9.5, 10,
10.5, 11, 11.5, 15.5, 16, 16.5, 17, 17.5, 18, 18.5, 19, 19.5,
20, 13.5, 14, 14.5, 15, 15.5, 16, 16.5, 17, 17.5, 18, 18.5, 19
), male = c(49L, 61L, 92L, 57L, 46L, 9L, 151L, 64L, 30L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 638L, 625L, 666L, 447L, 393L, 214L, 119L,
33L, 12L, 13L, 7L, 6L), female = c(0L, 0L, 0L, 0L, 0L, 0L, 3L,
15L, 35L, 78L, 122L, 105L, 76L, 28L, 13L, 36L, 0L, 5L, 8L, 13L,
38L, 60L, 54L, 38L, 28L, 93L, 131L, 195L), total = c(49L, 61L,
92L, 57L, 46L, 9L, 154L, 79L, 65L, 78L, 123L, 105L, 76L, 28L,
13L, 36L, 638L, 630L, 674L, 460L, 431L, 274L, 173L, 71L, 40L,
106L, 138L, 201L)), .Names = c("cruise", "areaname", "size",
"male", "female", "total"), row.names = c(NA, 28L), class = "data.frame")
Model
cdata <- ddply(shrimp, c("cruise", "areaname"), summarise,
female = dose.p(glm(cbind(female, males) ~ size,
family=binomial(logit), data=shrimp), cf=1:2, p =.5))

There's nothing to test this against (yet) but perhaps:
library(MASS)
library(plyr)
# Fit one binomial (logit) regression of sex on size per cruise/areaname group
# and report the size at which the fitted proportion female is 0.5.
# No data= argument is passed to glm() here, so female, male and size are
# looked up in the columns of the current group rather than in the full
# shrimp data frame (compare the per-group values in the output below).
cdata <- ddply(shrimp, c("cruise", "areaname"), summarise,
# cbind(female, male) supplies the success/failure counts for each size class.
female = dose.p( glm(cbind(female, male) ~ size,
# cf = 1:2 presumably selects the intercept and the size coefficient;
# p = 0.5 asks for the 50% point -- confirm against ?MASS::dose.p.
family=binomial("logit") ), cf=1:2, p = 0.5)
)
## cruise areaname female
## 1 1972 Balboa 67.15131
## 2 1985 Pavlof 16.88196
## 3 2003 Chiniak 16.37031

Related

How to use Predict in rms package for a multiple values?

I fitted a Cox model and used the Predict function from the rms package. For a single set of values it returns the correct result, whereas when I give multiple values it gives me weird results.
library(data.table)
library(survival)
library(survminer)
library(rms)
dput(df)
structure(list(ID = c(1001L, 1002L, 1003L, 1004L, 1006L, 1014L,
1015L, 1016L, 1018L, 1022L, 1024L, 1032L, 1040L, 1042L, 1049L,
1056L, 1059L, 1060L, 1066L, 1084L, 1087L, 1090L, 1093L, 1096L,
1097L, 1098L, 1099L, 1200L, 1205L, 1216L, 1221L, 1222L, 1225L,
1226L, 1233L, 1239L), Time = c(9L, 8L, 69L, 104L, 104L, 100L,
24L, 85L, 100L, 99L, 67L, 58L, 7L, 94L, 93L, 90L, 91L, 90L, 89L,
72L, 84L, 84L, 11L, 82L, 39L, 46L, 82L, 82L, 9L, 34L, 75L, 76L,
52L, 20L, 29L, 70L), Event = c(1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L), Risk1 = c(0.1,
0.03, 0.02, 0.05, 0.01, 0.04, 0.03, 0.06, 0.02, 0.03, 0, 0, 0.11,
0.01, 0.03, 0, 0.01, 0.01, 0.01, 0, 0, 0, 0.05, 0.01, 0, 0, 0,
0, 0.04, 0, 0.07, 0.01, 0.01, 0, 0, 0), Risk2 = c(88L, 49L, 60L,
46L, 50L, 60L, 38L, 74L, 39L, 65L, 80L, 35L, 54L, 40L, 54L, 55L,
60L, 38L, 64L, 74L, 71L, 57L, 55L, 49L, 42L, 30L, 63L, 46L, 47L,
58L, 34L, 72L, 50L, 60L, 73L, 51L), Risk3 = c(2L, 2L, 2L, 3L,
3L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 3L, 1L, 3L, NA, 2L, 3L, 2L, 2L,
2L, NA, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 2L
)), class = "data.frame", row.names = c(NA, -36L))
followed by
# Summarise the predictor distributions and register the summary by name so
# that rms functions can find it through the "datadist" option.
ddist <- datadist(df)
options(datadist = "ddist")

# Fit the Cox model. TRUE is spelled out because T is an ordinary variable
# that can be reassigned, which would silently change these arguments.
cox_model <- cph(
  Surv(Time, Event == 1) ~ Risk1 + Risk2 + Risk3,
  x = TRUE, y = TRUE, data = df, surv = TRUE
)

# Prediction for a single combination of predictor values.
Predict(cox_model, Risk1 = 3.2, Risk2 = 1, Risk3 = 0.5)
but when I give multiple values as follows:
Predict(cox_model,
Risk1=c(5,3,2,1.5,1.5,2,3,2.5,4,2,5.5,6,3,3.5,4,5,4.5,3,2,6,3,5,4,1.8,3,3.5,1.5,2.5,3.5,5,6,4,1.5,5,4,2.5),
Risk2=c(1,1,1,1,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,1,0,0,0,1,1,0,1,1,0,1,1,0,0,0),
Risk3=c(0,0.07,0,0.03,0.01,0.02,0.01,0,0.05,0,0.04,0.03,0.01,0.01,0.01,0,0.11,0.03,0,0.05,0,0,0.02,0.04,0.01,0,0,0.01,0.03,0,0.01,0,0.06,0,0,0.1))
It gives me an output with 46566 rows, whereas I have only 36 rows to predict.

Trouble making an object in phyloseq

I'm trying to make an object that I can use in the package phyloseq, but I can't seem to get it to work. Below is a small subset of my data. First is an otu table, the second has the taxa.
OTUs <- dput(OTU_table[1:5,])
structure(list(OTU_ID = c("OTU_1", "OTU_6", "OTU_16", "OTU_2",
"OTU_216"), V2T4r5Croot = c(3505L, 5L, 124L, 0L, 8L), V2T4r5Broot = c(18880L,
18390L, 1L, 10233L, 0L), R6T2r5Croot = c(82973L, 5195L, 444L,
93L, 7L), V2T2r2Broot = c(13747L, 79L, 1603L, 33L, 0L), R3T2r5Broot = c(11212L,
2L, 462L, 33977L, 0L), V2T2r2Croot = c(63779L, 354L, 5204L, 374L,
0L), R3T4r5Croot = c(60109L, 1518L, 4067L, 875L, 2L), R3T1r5Aroot = c(28412L,
3161L, 626L, 3465L, 131L), R3T4r2Croot = c(40569L, 110L, 575L,
8642L, 0L), V2T2r5Aroot = c(22800L, 2225L, 1334L, 12185L, 4L),
R6T2r5Broot = c(50017L, 5739L, 4199L, 0L, 0L), R6T1r1Broot = c(52756L,
0L, 35L, 490L, 0L), R6T1r2Croot = c(14828L, 10227L, 180L,
3973L, 10L), V2T1r6Aroot = c(40317L, 146L, 543L, 5975L, 36L
), R6T2r1Broot = c(13801L, 524L, 189L, 6121L, 0L), V2T4r1Broot = c(58001L,
21L, 247L, 2359L, 5L), R6T2r6Croot = c(79608L, 715L, 384L,
13121L, 0L), R3T2r1Aroot = c(7938L, 187L, 2305L, 212L, 0L
), R6T2r6Aroot = c(20243L, 1098L, 320L, 10632L, 9L), V2T1r5Broot = c(11102L,
156L, 200L, 8205L, 0L), V2T4r2Aroot = c(7641L, 393L, 53L,
366L, 27L), R6T4r5Croot = c(5L, 68L, 7192L, 4L, 0L), R6T4r5Broot = c(40122L,
92L, 29L, 64631L, 59L), R3T4r6Croot = c(49960L, 101L, 97L,
18846L, 0L), R6T2r2Aroot = c(81204L, 7801L, 1499L, 13245L,
6L), R3T4r5Aroot = c(108839L, 5072L, 1894L, 1957L, 55L),
R3T2r1Broot = c(624L, 0L, 62L, 3687L, 0L), R6T1r5Croot = c(67805L,
0L, 238L, 2L, 0L), V2T4r6Croot = c(43210L, 24L, 0L, 33L,
0L), R3T1r6Aroot = c(6419L, 274L, 1062L, 2411L, 79L), R3T2r2Croot = c(53908L,
34726L, 3497L, 82L, 144L), R3T4r2Aroot = c(10503L, 48L, 23L,
27764L, 0L), R3T2r5Aroot = c(2386L, 79L, 39L, 1805L, 0L),
V2T2r1Croot = c(20324L, 318L, 14L, 1192L, 0L), V2T1r5Aroot = c(3933L,
33L, 6L, 3785L, 0L), V2T1r1Broot = c(99803L, 7377L, 203L,
1098L, 75L), R3T4r6Aroot = c(16601L, 1113L, 217L, 309L, 33L
), R3T2r1Croot = c(135822L, 24308L, 4986L, 219L, 230L), V2T1r5Croot = c(12444L,
139L, 32L, 211L, 0L), R3T1r6Croot = c(3957L, 9L, 117L, 293426L,
0L), R6T1r2Broot = c(92870L, 43L, 625L, 616L, 0L), V2T2r2Aroot = c(26697L,
654L, 130L, 31056L, 0L), R3T2r6Broot = c(82471L, 22990L,
3253L, 955L, 4L), R6T1r1Aroot = c(11187L, 0L, 5L, 0L, 0L),
R6T1r6Broot = c(6016L, 72L, 386L, 3368L, 0L), R3T1r1Aroot = c(55133L,
5854L, 494L, 1694L, 45L), V2T1r2Aroot = c(9346L, 139L, 17L,
64L, 0L), R3T4r1Aroot = c(84510L, 4049L, 1441L, 1193L, 5L
), R6T2r5Aroot = c(38997L, 33L, 273L, 967L, 0L), R3T4r2Broot = c(54402L,
565L, 567L, 9L, 0L), R3T1r2Broot = c(42977L, 24L, 132L, 3L,
7L), R6T1r5Aroot = c(5433L, 39L, 16L, 2L, 0L), R3T1r1Croot = c(4356L,
0L, 0L, 24719L, 0L), R3T4r5Broot = c(39402L, 6424L, 151L,
0L, 0L), R6T1r2Aroot = c(67639L, 14L, 16L, 1L, 0L), R3T2r5Croot = c(12136L,
3420L, 193L, 98L, 0L), R3T1r5Croot = c(21358L, 2876L, 347L,
9850L, 0L), V2T1r6Broot = c(16975L, 2L, 273L, 1397L, 98L),
R6T1r1Croot = c(7403L, 18L, 36L, 2112L, 0L), R3T1r1Broot = c(18301L,
1122L, 276L, 6921L, 7L), V2T2r6Croot = c(59794L, 2560L, 92L,
12437L, 0L), R6T1r5Broot = c(58396L, 1284L, 119L, 21078L,
0L), V2T4r6Broot = c(10496L, 773L, 1603L, 1950L, 19L), V2T1r6Croot = c(34687L,
9560L, 38L, 19L, 1L), R3T4r1Broot = c(23020L, 78L, 276L,
160L, 66L), V2T4r1Aroot = c(32591L, 91L, 197L, 308L, 0L),
V2T4r6Aroot = c(35314L, 3L, 1550L, 1775L, 0L), V2T2r6Aroot = c(12424L,
884L, 149L, 481L, 0L), V2T2r1Aroot = c(6721L, 52L, 203L,
4286L, 0L), R3T2r2Broot = c(26251L, 447L, 326L, 4834L, 0L
), V2T4r2Broot = c(27830L, 2404L, 1131L, 98L, 18L), R6T1r6Aroot = c(11818L,
14L, 34L, 1L, 0L), V2T1r1Croot = c(7961L, 5L, 376L, 2802L,
0L), R6T2r2Croot = c(25329L, 15L, 63L, 76L, 0L), R6T2r2Broot = c(1002L,
0L, 153L, 26L, 0L), R6T2r1Aroot = c(38869L, 11953L, 1987L,
2639L, 0L), V2T4r5Aroot = c(1838L, 18L, 1L, 47981L, 5L),
R6T4r1Aroot = c(3323L, 16L, 3L, 7212L, 0L), V2T2r5Croot = c(22124L,
1037L, 395L, 1515L, 296L), R3T4r6Broot = c(4112L, 0L, 0L,
1L, 0L), R3T1r5Broot = c(4443L, 120L, 528L, 1176L, 0L), V2T2r6Broot = c(2068L,
55L, 11L, 7180L, 0L), R3T2r2Aroot = c(4962L, 277L, 35L, 1L,
7L), V2T1r1Aroot = c(18506L, 0L, 44L, 93L, 0L), R3T1r2Aroot = c(19779L,
2L, 162L, 51355L, 8L), R6T2r1Croot = c(913L, 4L, 26L, 10L,
0L), V2T2r5Broot = c(7309L, 69L, 63L, 38L, 15L), V2T4r1Croot = c(8043L,
231L, 1351L, 787L, 18L), R3T1r6Broot = c(1973L, 1L, 0L, 40482L,
0L), R3T4r1Croot = c(4004L, 326L, 12L, 2020L, 11L), V2T1r2Croot = c(2712L,
21L, 43L, 2127L, 0L), V2T4r2Croot = c(3711L, 118L, 0L, 1487L,
50L), R3T2r6Croot = c(1491L, 290L, 55L, 33L, 0L), R6T1r6Croot = c(8541L,
0L, 0L, 0L, 0L), R6T2r6Broot = c(2561L, 6L, 2L, 387L, 0L),
V2T2r1Broot = c(2128L, 315L, 180L, 1483L, 0L), V2T1r2Broot = c(2363L,
78L, 260L, 2182L, 0L), R3T2r6Aroot = c(486L, 0L, 191L, 1209L,
0L), R3T1r2Croot = c(6014L, 18L, 126L, 587L, 0L), NC1root = c(5L,
0L, 0L, 0L, 0L), R6T4r6Aroot = c(246L, 3L, 7L, 83L, 0L)), row.names = c(NA,
5L), class = "data.frame")
and
taxa <- dput(taxa[c(1,2,6,16,216),])
structure(list(OTU_ID = c("OTU_1", "OTU_2", "OTU_6", "OTU_16",
"OTU_216"), Kingdom = c("Fungi_1", "Fungi_1", "Fungi_1", "Fungi_1",
"Fungi_1"), Phylum = c("Ascomycota_1", "Ascomycota_1", "", "Ascomycota_1",
"Basidiomycota_1"), Class = c("Sordariomycetes_1", "Dothideomycetes_1",
"", "Dothideomycetes_1", "Tremellomycetes_1"), Order = c("Hypocreales_1",
"Pleosporales_1", "", "Pleosporales_1", "Tremellales_1"), Family = c("Nectriaceae_1",
"Corynesporascaceae_1", "", "Pleosporaceae_1", "Trimorphomycetaceae"
), Genus = c("Fusarium_1", "Corynespora_1", "", "Alternaria_1",
"Saitozyma"), Species = c("", "Corynespora cassiicola ", "",
"", "")), row.names = c(1L, 2L, 6L, 16L, 216L), class = "data.frame")
Here's some code I've tried:
OTUs1 <- as.matrix(OTUs)
taxa1 <- as.matrix(taxa)
OTUs2 = otu_table(OTUs1, taxa_are_rows = TRUE)
physeq <- phyloseq(OTUs2, taxa1)
I get this error when I run the code to try and make the object:
Error in phyloseq(OTUs2, taxa1) :
Problem with OTU/taxa indices among those you provided.
Check using intersect() and taxa_names()
I assume that you are getting this error because the row.names between both matrices do not match. The following steps should resolve your error:
# library() stops with an error if dplyr is missing; require() would only
# return FALSE and let the script fail later at dplyr::intersect().
library(dplyr)

# Add OTU_IDs as row.names so the two tables can be matched by taxon
row.names(taxa) <- taxa[["OTU_ID"]]
row.names(OTUs) <- OTUs[["OTU_ID"]]

# Remove the OTU_ID column; otherwise as.matrix() below would turn the count
# table into a character matrix. drop = FALSE keeps a data frame even if only
# a single column is left.
taxa <- taxa[, colnames(taxa) != "OTU_ID", drop = FALSE]
OTUs <- OTUs[, colnames(OTUs) != "OTU_ID", drop = FALSE]

# Make sure that OTU_IDs in taxa and OTUs overlap and are in the same order
ids <- dplyr::intersect(row.names(taxa), row.names(OTUs))
taxa <- taxa[ids, , drop = FALSE]
OTUs <- OTUs[ids, , drop = FALSE]

# Convert to phyloseq components, then build the phyloseq object
taxa <- tax_table(as.matrix(taxa))
OTUs <- otu_table(as.matrix(OTUs), taxa_are_rows = TRUE)
physeq <- phyloseq(taxa, OTUs)

ANOVA error: why is each row of output *not* identified by a unique combination of keys?

I have a two-way ANOVA test (w/repeated measures) that I'm using with four almost identical datasets:
> res.aov <- anova_test(
+ data = LST_Weather_dataset_N, dv = LST, wid = Month,
+ within = c(Buffer, TimePeriod),
+ effect.size = "ges",
+ detailed = TRUE,
+ )
Where:
LST = surface temperature deviation in C
Month = 1-12
Buffer = a value 100-1900 - one of 19 areas outward from the boundary of a solar power plant (each 100m wide)
TimePeriod = a factor with a value of 1 or 2 corresponding to pre-/post-construction of a solar power plant.
For one dataset I get the error:
Error: Each row of output must be identified by a unique combination of keys.
Keys are shared for 38 rows:
* 10, 11
* 217, 218
* 240, 241
* 263, 264
* 286, 287
* 309, 310
* 332, 333
...
As far as I can tell I have unique combinations.
dplyr::count(LST_Weather_dataset_N, LST, Month, Buffer, TimePeriod, sort = TRUE)
returns
LST Month Buffer TimePeriod n
1 -6.309045316 12 100 2 1
2 -5.655279925 9 1000 2 1
3 -5.224196295 12 200 2 1
4 -5.194473224 9 1100 2 1
5 -5.025429891 12 400 2 1
6 -4.987575966 9 700 2 1
7 -4.979453868 12 600 2 1
8 -4.825298768 12 300 2 1
9 -4.668994574 12 500 2 1
10 -4.652282192 12 700 2 1
...
'n' is always 1.
I can't work out why this is happening.
Extract of the data frame below:
> dput(LST_Weather_dataset_N[sample(1:nrow(LST_Weather_dataset_N), 50),])
structure(list(Buffer = c(1400L, 700L, 300L, 1400L, 100L, 200L,
1700L, 100L, 800L, 1900L, 1100L, 100L, 700L, 800L, 1400L, 400L,
1300L, 200L, 1200L, 500L, 1200L, 1300L, 400L, 1000L, 1300L, 1100L,
100L, 300L, 300L, 600L, 1100L, 1400L, 1500L, 1600L, 1700L, 1800L,
1700L, 1300L, 1200L, 300L, 1100L, 1900L, 1700L, 700L, 1400L,
1200L, 1600L, 1700L, 1900L, 1300L), Date = c("02/05/2014", "18/01/2017",
"19/06/2014", "25/12/2013", "15/09/2017", "08/04/2017", "22/08/2014",
"21/07/2014", "13/07/2017", "25/12/2013", "22/10/2013", "02/05/2014",
"07/03/2017", "15/03/2014", "13/07/2017", "19/06/2014", "25/12/2013",
"17/10/2017", "16/04/2014", "06/10/2013", "15/09/2017", "18/01/2017",
"10/01/2014", "17/12/2016", "13/07/2017", "19/06/2014", "07/03/2017",
"15/03/2014", "11/02/2014", "22/10/2013", "06/10/2013", "15/09/2017",
"16/04/2014", "18/01/2017", "15/03/2014", "21/07/2014", "17/10/2017",
"15/09/2017", "10/01/2014", "23/09/2014", "16/04/2014", "22/10/2013",
"11/06/2017", "26/05/2017", "19/06/2014", "14/08/2017", "11/02/2014",
"26/02/2017", "26/02/2017", "11/02/2014"), LST = c(1.255502397,
4.33385966, 3.327025603, -0.388631166, -0.865430798, 4.386292648,
-0.243018665, 3.276865987, 0.957036835, -0.065821795, 0.69731779,
4.846851651, -1.437700684, 1.003808572, 0.572460421, 2.995902374,
-0.334633662, -1.231447567, 0.644520741, 0.808262029, -3.392959991,
2.324569449, 2.346707612, -3.124354627, 0.58719862, 1.904859254,
1.701580958, 2.792443253, 1.638270039, 1.460743317, 0.699767335,
-3.015643366, 0.930527864, 1.309519336, 0.477789664, 0.147584938,
-0.498188865, -3.506795723, -1.007487965, 1.149604087, 1.192366386,
0.197471474, 0.999391224, -0.190613618, 1.27324015, 2.686622796,
0.573109026, 0.97847983, 0.395005095, -0.40855426), Month = c(5L,
1L, 6L, 12L, 9L, 4L, 8L, 7L, 7L, 12L, 10L, 5L, 3L, 3L, 7L, 6L,
12L, 10L, 4L, 10L, 9L, 1L, 1L, 12L, 7L, 6L, 3L, 3L, 2L, 10L,
10L, 9L, 4L, 1L, 3L, 7L, 10L, 9L, 1L, 9L, 4L, 10L, 6L, 5L, 6L,
8L, 2L, 2L, 2L, 2L), Year = c(2014L, 2017L, 2014L, 2013L, 2017L,
2017L, 2014L, 2014L, 2017L, 2013L, 2013L, 2014L, 2017L, 2014L,
2017L, 2014L, 2013L, 2017L, 2014L, 2013L, 2017L, 2017L, 2014L,
2016L, 2017L, 2014L, 2017L, 2014L, 2014L, 2013L, 2013L, 2017L,
2014L, 2017L, 2014L, 2014L, 2017L, 2017L, 2014L, 2014L, 2014L,
2013L, 2017L, 2017L, 2014L, 2017L, 2014L, 2017L, 2017L, 2014L
), JulianDay = c(122L, 18L, 170L, 359L, 258L, 98L, 234L, 202L,
194L, 359L, 295L, 122L, 66L, 74L, 194L, 170L, 359L, 290L, 106L,
279L, 258L, 18L, 10L, 352L, 194L, 170L, 66L, 74L, 42L, 295L,
279L, 258L, 106L, 18L, 74L, 202L, 290L, 258L, 10L, 266L, 106L,
295L, 162L, 146L, 170L, 226L, 42L, 57L, 57L, 42L), TimePeriod = c(1L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L,
1L), Temperature = c(28L, 9L, 31L, 12L, 27L, 21L, 29L, 36L, 38L,
12L, 23L, 28L, 12L, 21L, 38L, 31L, 12L, 23L, 25L, 22L, 27L, 9L,
11L, 7L, 38L, 31L, 12L, 21L, 14L, 23L, 22L, 27L, 25L, 9L, 21L,
36L, 23L, 27L, 11L, 31L, 25L, 23L, 29L, 27L, 31L, 34L, 14L, 16L,
16L, 14L), Humidity = c(6L, 34L, 7L, 31L, 29L, 22L, 34L, 15L,
19L, 31L, 16L, 6L, 14L, 14L, 19L, 7L, 31L, 12L, 9L, 12L, 29L,
34L, 33L, 18L, 19L, 7L, 14L, 14L, 31L, 16L, 12L, 29L, 9L, 34L,
14L, 15L, 12L, 29L, 33L, 18L, 9L, 16L, 8L, 13L, 7L, 13L, 31L,
31L, 31L, 31L), Wind_speed = c(6L, 0L, 6L, 7L, 13L, 33L, 6L,
20L, 9L, 7L, 0L, 6L, 0L, 6L, 9L, 6L, 7L, 6L, 0L, 7L, 13L, 0L,
0L, 35L, 9L, 6L, 0L, 6L, 6L, 0L, 7L, 13L, 0L, 0L, 6L, 20L, 6L,
13L, 0L, 0L, 0L, 0L, 24L, 11L, 6L, 24L, 6L, 26L, 26L, 6L), Wind_gust = c(0L,
0L, 0L, 0L, 0L, 54L, 0L, 46L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 48L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 46L, 0L, 0L, 0L, 0L, 0L, 0L, 48L, 0L, 0L, 39L,
0L, 41L, 41L, 0L), Wind_trend = c(1L, 0L, 1L, 1L, 2L, 2L, 0L,
1L, 2L, 1L, 0L, 1L, 0L, 1L, 2L, 1L, 1L, 0L, 0L, 2L, 2L, 0L, 1L,
1L, 2L, 1L, 0L, 1L, 1L, 0L, 2L, 2L, 0L, 0L, 1L, 1L, 0L, 2L, 1L,
1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Wind_direction = c(0,
0, 0, 337.5, 360, 22.5, 0, 22.5, 0, 337.5, 0, 0, 0, 0, 0, 0,
337.5, 180, 0, 247.5, 360, 0, 0, 180, 0, 0, 0, 0, 337.5, 0, 247.5,
360, 0, 0, 0, 22.5, 180, 360, 0, 0, 0, 0, 360, 22.5, 0, 360,
337.5, 360, 360, 337.5), Pressure = c(940.2, 943.64, 937.69,
951.37, 932.69, 933.94, 937.07, 938.01, 937.69, 951.37, 939.72,
940.2, 948.33, 947.71, 937.69, 937.69, 951.37, 943.32, 932.69,
944.71, 932.69, 943.64, 942.31, 943.01, 937.69, 937.69, 948.33,
947.71, 941.94, 939.72, 944.71, 932.69, 932.69, 943.64, 947.71,
938.01, 943.32, 932.69, 942.31, 938.94, 932.69, 939.72, 928.31,
931.12, 937.69, 932.37, 941.94, 936.13, 936.13, 941.94), Pressure_trend = c(1L,
2L, 0L, 2L, 0L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 0L, 2L,
1L, 2L, 1L, 0L, 2L, 2L, 2L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 2L,
2L, 1L, 1L, 1L, 0L, 2L, 1L, 2L, 1L, 0L, 0L, 0L, 1L, 1L, 2L, 2L,
1L)), row.names = c(179L, 14L, 195L, 426L, 306L, 118L, 299L,
229L, 244L, 436L, 374L, 153L, 90L, 91L, 256L, 197L, 424L, 348L,
137L, 355L, 328L, 26L, 7L, 419L, 254L, 211L, 78L, 81L, 43L, 359L,
373L, 332L, 143L, 32L, 109L, 263L, 393L, 330L, 23L, 309L, 135L,
398L, 224L, 166L, 217L, 290L, 69L, 72L, 76L, 63L), class = "data.frame")
Well, this is a bit embarrassing.
The error arose as there were not, in fact, paired months of the data. Rather than there being 38 data (19x2) for each month, due to an error in determining the month value one month had 57 data (19x3). Correcting this, and checking that each month had the same number of paired data for the ANOVA allowed the test to run successfully.
> res.aov <- anova_test(
+ data = LST_Weather_dataset_N, dv = LST, wid = Month,
+ within = c(Buffer, TimePeriod),
+ effect.size = "ges",
+ detailed = TRUE,
+ )
> get_anova_table(res.aov, correction = "auto")
ANOVA Table (type III tests)
Effect DFn DFd SSn SSd F p p<.05 ges
1 (Intercept) 1 11 600.135 974.584 6.774 2.50e-02 * 0.189
2 Buffer 18 198 332.217 331.750 11.015 2.05e-21 * 0.115
3 TimePeriod 1 11 29.561 977.945 0.333 5.76e-01 0.011
4 Buffer:TimePeriod 18 198 13.055 283.797 0.506 9.53e-01 0.005
I still don't understand how the error message was telling me this, though.

Widening Data and Changing Columns

I have managed to delete a little bit of code that did the below task and can't for the life of me figure out how I did it before.
I want to widen the data that has two factors spread over 8 different 'waves'. There are four 'Paper' factors, each with the same four internal factors 'Response'. The output from a previously required function gives the following dataframe:
[
And I would like to make it look like this:
The single column of the first tibble has become the single row of the second tibble.
As you can see, the second tibble has extra factors of Paper but these can just be joined row wise.
I really wasn't sure how to attack this, but thought it would be done using the pivot_wider function. When I tried
times_correct <- times_19 %>%
pivot_wider( id_cols = c('Stay/remain in the EU`', 'Leave the EU', 'I would/will not vote', 'Don\'t know'), names_from = eurrefcolnames)
I got the error that I can't subset columns that don't exist which makes sense: I need to manually add the correct 'Waves'. I think this is relatively simple, but can't for the life of me figure out how I did it!
Here is the dput of the various tibbles:
structure(list(resp = structure(c(3L, 2L, 4L, 1L, NA, NA, NA,
NA), .Label = c("Don't Know", "Leave", "Remain", "Will Not Vote"
), class = "factor"), `Stay/remain in the EU` = c(316L, 290L,
313L, 324L, 338L, 320L, 325L, 335L), `Leave the EU` = c(157L,
123L, 159L, 154L, 134L, 189L, 187L, 181L), `I would/will not vote` = c(2L,
3L, 3L, 3L, 2L, 2L, 2L, 0L), `Don't know` = c(56L, 51L, 55L,
50L, 57L, 20L, 17L, 0L), Paper = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = "Times", class = "factor")), row.names = c(NA,
-8L), class = c("tbl_df", "tbl", "data.frame"))
structure(list(resp = structure(c(3L, 2L, 4L, 1L, 3L, 2L, 4L,
1L, 3L, 2L, 4L, 1L, 3L, 2L, 4L, 1L, 3L, 2L, 4L, 1L), .Label = c("Don't Know",
"Leave", "Remain", "Will Not Vote"), class = "factor"), euRefVoteW1 = c(316L,
157L, 2L, 56L, 190L, 339L, 4L, 70L, 819L, 79L, 9L, 71L, 1294L,
1311L, 150L, 523L, 1715L, 2587L, 133L, 630L), euRefVoteW2 = c(290L,
123L, 3L, 51L, 175L, 282L, 3L, 62L, 777L, 74L, 5L, 62L, 1091L,
925L, 80L, 371L, 1528L, 2044L, 83L, 517L), euRefVoteW3 = c(313L,
159L, 3L, 55L, 199L, 334L, 4L, 69L, 835L, 81L, 10L, 57L, 1348L,
1289L, 139L, 508L, 1766L, 2563L, 156L, 586L), euRefVoteW4 = c(324L,
154L, 3L, 50L, 215L, 328L, 2L, 61L, 848L, 70L, 10L, 55L, 1397L,
1267L, 128L, 492L, 1853L, 2494L, 143L, 583L), euRefVoteW6 = c(338L,
134L, 2L, 57L, 241L, 286L, 2L, 77L, 853L, 68L, 5L, 57L, 1519L,
1133L, 112L, 520L, 2017L, 2284L, 106L, 667L), euRefVoteW7 = c(320L,
189L, 2L, 20L, 186L, 384L, 2L, 34L, 832L, 109L, 8L, 34L, 1449L,
1456L, 87L, 292L, 1906L, 2785L, 55L, 328L), euRefVoteW8 = c(325L,
187L, 2L, 17L, 187L, 384L, 1L, 34L, 836L, 118L, 5L, 24L, 1462L,
1522L, 72L, 228L, 1898L, 2852L, 56L, 268L), euRefVoteW9 = c(335L,
181L, 0L, 0L, 206L, 385L, 0L, 6L, 844L, 102L, 0L, 4L, 1572L,
1462L, 0L, 21L, 2018L, 2827L, 0L, 20L), Paper = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L), .Label = c("Times", "Telegraph", "Control", "No_Paper",
"Rest"), class = "factor")), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
# Names of the per-wave vote columns (there is no wave 5 column in the data).
# Each name must match its column exactly: a stray leading space, as in
# " euRefVoteW7", refers to a column that does not exist.
eurrefcolnames <- c(
  "euRefVoteW1", "euRefVoteW2", "euRefVoteW3", "euRefVoteW4",
  "euRefVoteW6", "euRefVoteW7", "euRefVoteW8", "euRefVoteW9"
)
EDIT:
Here is the function that create the initial dataframes, is there an edit I could make here perhaps ?
tally_reader_number <- function(input_dataframe, newspaper_name) {
  # Tally the EU-referendum responses of one newspaper's readers.
  #
  # input_dataframe: the in_all_waves data, which has a Paper column.
  # newspaper_name:  the Paper value whose rows are tallied.
  #
  # Returns a data frame with a factor column resp first, the tallied counts
  # next, and a factor column Paper holding newspaper_name last.

  # Keep only this paper's rows, then drop the Paper column so that it is not
  # tallied itself. ungroup() is required: the function refuses to work
  # without it.
  paper_rows <- input_dataframe %>%
    filter(Paper == newspaper_name) %>%
    ungroup() %>%
    select(-Paper)

  # map_df() from purrr applies table() to each remaining column and
  # row-binds the results; the row names are then exposed as "response".
  tallies <- paper_rows %>%
    map_df(table) %>%
    rownames_to_column("response")

  # Translate the row numbers into response labels, move resp to the front,
  # drop the helper column and tag every row with the newspaper.
  # response holds the row names as text; the comparisons with numbers rely
  # on R's implicit coercion.
  labelled <- tallies %>%
    mutate(resp = case_when(response == 1 ~ "Remain",
                            response == 2 ~ "Leave",
                            response == 3 ~ "Will Not Vote",
                            response == 4 ~ "Don't Know")) %>%
    select(resp, everything(), -response) %>%
    mutate(Paper = newspaper_name)

  # Both label columns are returned as factors.
  labelled[["Paper"]] <- as.factor(labelled[["Paper"]])
  labelled[["resp"]] <- as.factor(labelled[["resp"]])
  labelled
}

Support Vector Machine Visualization in R

I am having trouble graphing my SVM model in R. The formula is:
svm_linear <- svm(open ~ review_count + recession + duration + count + stars + Freq + avgRev + avgStar, data=yelp_train, cost=100, gamma=1)
plot(svm_linear, data=yelp_train)
I can't figure out why nothing appears after running the plot function. Please help.
I added the dput output.
I cut out some of the extra columns to avoid waste.
newdata <- cleanDataFrame[2:10]
set.seed(10)
(newdata[sample(1:nrow(newdata), 30),])
structure(list(open = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L,
1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 0L), review_count = c(3L, 5L, 6L, 38L, 6L, 4L, 5L,
23L, 19L, 3L, 22L, 74L, 15L, 38L, 88L, 26L, 9L, 3L, 58L, 4L,
13L, 117L, 38L, 10L, 5L, 6L, 102L, 108L, 264L, 103L), stars = c(3,
4, 4.5, 4, 3, 3, 3, 4, 3.5, 3.5, 3.5, 4.5, 4.5, 4, 2.5, 3.5,
3.5, 3.5, 4, 3, 4.5, 4.5, 4, 3.5, 4, 3.5, 4, 3, 3.5, 4), Freq = c(166L,
12L, 166L, 15L, 45L, 166L, 66L, 79L, 33L, 58L, 150L, 389L, 150L,
1L, 389L, 20L, 389L, 389L, 389L, 166L, 74L, 0L, 389L, 32L, 389L,
161L, 126L, 389L, 98L, 3L), avgRev = c(23.7904191616766, 18.7692307692308,
23.7904191616766, 98, 78.804347826087, 23.7904191616766, 31.3283582089552,
64.3375, 23.1764705882353, 23.6949152542373, 60.6490066225166,
34.1923076923077, 60.6490066225166, 22, 34.1923076923077, 33.1904761904762,
34.1923076923077, 34.1923076923077, 34.1923076923077, 30.8443113772455,
27.6533333333333, 117, 34.1923076923077, 30.4545454545455, 34.1923076923077,
37.2716049382716, 47.3149606299213, 34.1923076923077, 64.3838383838384,
73.75), avgStar = c(3.53592814371257, 3.92307692307692, 3.53592814371257,
3.96875, 3.6195652173913, 3.53592814371257, 3.69402985074627,
3.58125, 3.5, 3.67796610169492, 3.63245033112583, 3.5551282051282,
3.63245033112583, 4, 3.5551282051282, 3.78571428571429, 3.5551282051282,
3.5551282051282, 3.5551282051282, 3.48203592814371, 3.72666666666667,
4.5, 3.5551282051282, 3.65151515151515, 3.5551282051282, 3.43827160493827,
3.63385826771654, 3.5551282051282, 3.60606060606061, 4.25), count = c(4L,
2L, 5L, 5L, 0L, 2L, 5L, 0L, 2L, 8L, 3L, 15L, 4L, 3L, 15L, 14L,
1L, 1L, 0L, 1L, 2L, 0L, 0L, 50L, 1L, 27L, 4L, 51L, 36L, 14L),
recession = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), duration = c(332L, 427L, 614L, 117L, 1894L,
1346L, 140L, 1909L, 1100L, 1030L, 1666L, 2096L, 1054L, 352L,
2145L, 1018L, 1763L, 391L, 2116L, 1567L, 693L, 674L, 1626L,
301L, 295L, 378L, 649L, 376L, 1028L, 2390L)), .Names = c("open",
"review_count", "stars", "Freq", "avgRev", "avgStar", "count",
"recession", "duration"), row.names = c(1439L, 870L, 1210L, 1962L,
242L, 639L, 777L, 771L, 1741L, 1214L, 1840L, 1603L, 322L, 1681L,
1010L, 1209L, 148L, 745L, 1124L, 2354L, 2433L, 1731L, 2180L,
1000L, 1141L, 1985L, 2814L, 674L, 2163L, 999L), class = "data.frame")
It looks like you're trying to do classification, but your outcome variable is integer mode. To see this, do str(yelp_train). Turn the outcome into a factor and then try your plot again. For example:
# Recode the integer 0/1 outcome as a factor so that svm() treats the problem
# as classification.
yelp_train$openF <- factor(yelp_train$open)

svm_linear <- svm(
  openF ~ review_count + recession + duration + count + stars + Freq +
    avgRev + avgStar,
  data = yelp_train, cost = 100, gamma = 1
)

# The model has more than two predictors, so the plot needs a formula naming
# the two features to draw.
plot(svm_linear, formula = review_count ~ Freq, data = yelp_train)
One other thing. In the portion of the data you provided, recession is always zero. If this is the case with all of the data, then remove recession from your call to svm. I had to do this to avoid an error. Once I removed recession, I was able to run the model and plot several combinations of variables successfully.
Question in Comments: Why isn't Open the dependent variable in the formula in the plot function? You're plotting where the decision boundary lies in relation to the values of two of the independent variables (or "features" in machine learning lingo). The predicted value of the dependent variable, Open, is given by the fill colors: In this case, one color for Open=1 and another for Open=0. The boundary between the two colors is the decision boundary that the svm model came up with. The plot also includes points representing the pairs of values of the two features used for the plot. The two different plot markers represent the two different values of Open and you can see how many points were properly classified and how many were misclassified by your model.
The full decision boundary is a hyperplane in a multi-dimensional space. For example, if you had 3 features in the model, the features would lie in a 3-dimensional space (imagine a 3D scatterplot) and the decision boundary would be a 2-dimensional hyperplane through that 3D space (which we of course refer to as a "plane" in this case; and in general, the decision boundary has dimension one less than the dimension of the feature space).
When you plot two features, you're looking at a two-dimensional slice through that multi-dimensional space. The plot function is setting the values of the other features to some specific values--maybe the mean for numeric variables and the base factor level for factor variables--check the documentation to be sure. The plot function for svm models allows you to set the specific values of the other features (besides the two you're plotting) using the slice argument. That allows you to see how the decision boundary for two particular features varies based on changes in the values of other features.
You might find the svm chapter of Introduction to Statistical Learning useful for additional info (you can download it at no charge).

Resources