t.test outputs in the `table` package in R - r

So here's a sample of the data I am working with:
> dput(candidateEvokeDFYoung)
structure(list(youngTreatment = structure(c(NA, 1, 0, 1, 0, 1,
0, 1, 1, 0, 1, 1, 0, 0, NA, NA, NA, NA, 1, 1), format.stata = "%10.0g"),
candTrustworthy = structure(c(0, 0, 0, 0, 0, 0, 0, 1, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), format.stata = "%10.0g"),
candKnowledgeable = structure(c(1, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), format.stata = "%10.0g"),
candQualified = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1), format.stata = "%10.0g"),
candConservative = structure(c(0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), format.stata = "%10.0g"),
candLiberal = structure(c(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), format.stata = "%10.0g"), candInexperienced = structure(c(0,
1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0), format.stata = "%10.0g"),
candPrincipled = structure(c(1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 1, 1, 1, 0, 0, 0, 0), format.stata = "%10.0g"),
candDistance = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0), format.stata = "%10.0g"),
candEfficacy = structure(c(1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 1, 0, 0), format.stata = "%10.0g")), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
What I am trying to do is generate a table using the tables package with results from a t.test. The trouble I am having is I've taken this dataset and have used lapply to calculate my t.tests on each of the variables with youngTreatment as my 'y' variable:
candidateEvokesDiffYoung = lapply(candidateEvokeDFYoung[-1], function(x) t.test(x ~ candidateEvokeDFYoung$youngTreatment))
This gives me a list of lists. I have no clue how to use tables::tabular to access
list[['statistic']]
and
list[['p.value]]
I could definitely just manually pull all of these out myself and put it in a dataframe for stargazer or something, but I was wondering if there was someone who knew how I could do this more efficiently and with the tables package.

t.test returns objects of the class htest. I believe the best way to gather the results of an object of the class htest is to use the function tidy of the package broom.
library(broom)
candidateEvokesDiffYoung = lapply(candidateEvokeDFYoung[-1],
function(x) {
t.test(x ~ candidateEvokeDFYoung$youngTreatment)
})
m <- t(sapply(candidateEvokesDiffYoung, tidy))
This will allow you to refer to the elements in a similar way to what you seem to be trying to.
> m["candTrustworthy", "p.value"][[1]]
[1] 0.7875872
> unlist(m[, "p.value"])
candTrustworthy candKnowledgeable candQualified candConservative candLiberal candInexperienced candPrincipled candDistance candEfficacy
0.7875872 0.7875872 0.7875872 0.3632175 0.3465935 0.6933006 0.3790778 NaN 0.3632175

Related

Why am I getting this error when using glmnet in R

When trying to predict I am getting this error
error in evaluating the argument 'x' in selecting a method for function 'as.matrix': Cholmod error 'X and/or Y have wrong dimensions' at file ../MatrixOps/cholmod_sdmult.c, line 90
Here is my code so far
library(data.table)
library(caret)
library(Metrics)
library(glmnet)
library(plotmo)
library(lubridate)
#Reading in the necessary data needed for this project
train <- fread("project/volume/data/processed/start_train.csv")
test<-fread("project/volume/data/processed/start_test.csv")
example_sub<-fread("project/volume/data/processed/example_submission.csv")
card_tab <- fread("project/volume/data/processed/card_tab.csv")
#Merging the card_tab dataset with both my train and test datasets to add more variables to each
train = merge(train, card_tab, by = "id")
test = merge(test, card_tab, by = "id")
train$power = as.numeric(train$power)
train$toughness = as.numeric(train$toughness)
test$power = as.numeric(test$power)
test$toughness = as.numeric(test$toughness)
train$powerovertough = train$power/train$toughness
test$powerovertough = test$power/test$toughness
train$current_date<-as_date(train$current_date)
train<-train[order(-current_date)]
test$current_date<-as_date(test$current_date)
test<-test[order(-current_date)]
#Handling NA values in both train and test. The NA values will be set to 0
train[is.na(train)] <- 0
test[is.na(test)] <- 0
# Specifying which columns of our model we will be dropping and applying it to our train and test datasets
drops<- c('id','future_date','current_date','card_name','power','loyalty','cmc','type','colors','mana_cost','subtypes', 'text','set','set_name')
train<-train[, !drops, with = FALSE]
test<-test[, !drops, with = FALSE]
#Saving the response variable in the train dataset
train_y<-train$future_price
test$future_price<-0
#Using dummies to train model
dummies <- dummyVars(future_price ~ ., data = train)
train<-predict(dummies, newdata = train)
test<-predict(dummies, newdata = test)
train<-data.table(train)
test<-data.table(test)
#Cross validating the model to fin the best lamda value
train<-as.matrix(train)
test<-as.matrix(test)
gl_model<-cv.glmnet(train, train_y, alpha = 1,family="gaussian")
bestlam<-gl_model$lambda.min
# Fitting the full model
gl_model<-glmnet(train, train_y, alpha = 1,family="gaussian")
plot_glmnet(gl_model)
saveRDS(gl_model,"./project/volume/models/gl_model.model")
test<-as.matrix(test)
#use the full model
pred<-predict(gl_model,s=bestlam, newx = test)
I am trying to predict future_price for my test set. The error is saying my dimensions are wrong but I don't know what could be causing them to be different. I have tried observing the data sets as it runs through the code and they seem to have the same variables.
Here is the dput
> dput(head(train))
structure(c(0.25, 0.1, 0.1, 0.1, 0.25, 0.25, 1, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 3, 0, 2, 0, 0, 0, 0.333333333333333,
0, 1, 0), .Dim = c(6L, 20L), .Dimnames = list(NULL, c("current_price",
"typesArtifact", "typesArtifact Creature", "typesCreature", "typesEnchantment",
"typesEnchantment Artifact", "typesEnchantment Creature", "typesInstant",
"typesLand", "typesPlaneswalker", "typesSorcery", "supertypes",
"supertypesBasic", "supertypesLegendary", "rarityCommon", "rarityMythic",
"rarityRare", "rarityUncommon", "toughness", "powerovertough"
)))
> dput(head(test))
structure(c(0.15, 0.16, 2, 0.39, 0.16, 0.19, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0), .Dim = c(6L, 18L), .Dimnames = list(
NULL, c("current_price", "typesArtifact", "typesArtifact Creature",
"typesCreature", "typesEnchantment", "typesInstant", "typesLand",
"typesPlaneswalker", "typesSorcery", "supertypes", "supertypesBasic",
"supertypesLegendary", "rarityCommon", "rarityMythic", "rarityRare",
"rarityUncommon", "toughness", "powerovertough")))

lapply Question: How to streamline further without generating errors

I'm looking to condense the steps in my script, but I'm having issues with lapply(). It looks to be an issue with my code as usual. Any help would be much appreciated!
library(iNEXT)
sa4 <- list(Bird = list(structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0), .Dim = c(26L,
6L), .Dimnames = list(Scientific_name = c(" Pycnonotus plumosus",
"Acridotheres javanicus", "Aegithina tiphia", "Aethopyga siparaja",
"Anthreptes malacensis", "Aplonis panayensis", "Cacatua goffiniana",
"Callosciurus notatus", "Cinnyris jugularis", "Copsychus malabaricus",
"Copsychus saularis", "Dicaeum cruentatum", "Dicrurus paradiseus",
"Gorsachius melanolophus", "Larvivora cyane", "Macronus gularis",
"Oriolus chinensis", "Orthotomus atrogularis", "Otus lempiji",
"Pitta moluccensis", "Pycnonotus goiavier", "Pycnonotus plumosus",
"Pycnonotus zeylanicus", "Spilopelia chinensis", "Todiramphus chloris",
"Zosterops simplex"), Sampling_Point = c("SA_01", "SA_02", "SA_03",
"SA_04", "SA_05", "SA_06")))), Butterfly = list(structure(c(0,
0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,
0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0), .Dim = c(10L,
4L), .Dimnames = list(Scientific_name = c("Burara harisa consobrina",
"Catopsilia pyranthe pyranthe", "Catopsilia scylla cornelia",
"Delias hyparete metarete", "Eurema sp", "Idea leuconoe clara",
"Pachliopta aristolochiae asteris", "Phalanta phalantha phalantha",
"Troides helena cerberus", "Zizula hylax pygmaea"), Sampling_Point = c("SA_01",
"SA_02", "SA_04", "SA_06")))), Mammal = list(structure(c(0, 1,
1, 1, 1, 0), .Dim = 2:3, .Dimnames = list(Scientific_name = c("Callosciurus notatus",
"Unidentified Fruit Bat sp"), Sampling_Point = c("SA_03", "SA_04",
"SA_05")))), Reptile = list(structure(1, .Dim = c(1L, 1L), .Dimnames = list(
Scientific_name = "Hemidactylus frenatus", Sampling_Point = "SA_05"))))
I've been doing it the longer way:
estimateD(sa4$Butterfly, datatype="incidence_raw") #Sampling coverage for butterflies
estimateD(sa4$Mammal, datatype="incidence_raw") #Sampling coverage for mammals
estimateD(sa4$Bird, datatype="incidence_raw") #Sampling coverage for birds
estimateD(sa4$Reptile, datatype="incidence_raw") #Sampling coverage for reptiles
Note that estimateD(sa4$Reptile, datatype="incidence_raw" generates an error since it only has one species.
Is it possible to condense the following steps via lapply? In this situation I've only have 4 taxa, but for other projects, it might be a lot more. I tried the following and it gives me a warning message--which actually is the same warning message as the one above. I'm wondering if lapply stops working if one component gives an error?
> (lapply(sa4, function(x) estimateD(x, datatype="incidence_raw")) )
Error in `[.data.frame`(tmp, , c(1, 2, 3, 7, 4, 5, 6)) :
undefined columns selected
In addition: Warning messages:
1: In FUN(X[[i]], ...) :
Invalid data type, the element of species by sites presence-absence matrix should be 0 or 1. Set nonzero elements as 1.
2: In log(b/Ub) : NaNs produced
Please let me know if I need to provide more information? Thank you!
This is a simple error trapping issue. Wrap tryCatcharound your problem function call and have the error function return information on what happened.
results <- lapply(sa4, function(x) {
tryCatch(estimateD(x, datatype="incidence_raw"),
error = function(e) e)
})
Now determine which ran alright.
ok <- !sapply(results, inherits, "error")
ok
# Bird Butterfly Mammal Reptile
# TRUE TRUE TRUE FALSE
And check those that did.
results[ok]
It is the issue with the 'Reptiles', so if we select the first 3 elements of the list, it should work
lapply(sa4[1:3], function(x) estimateD(x, datatype="incidence_raw"))

xgb.cv with no folds and return the results based on a split of the data

I have some data which looks like:
# A tibble: 50 x 28
sanchinarro date holiday weekday weekend workday_on_holi… weekend_on_holi… protocol_active
<dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 -1.01 2010-01-01 1 1 0 1 0 0
2 0.832 2010-01-02 0 0 1 0 0 0
3 1.29 2010-01-03 0 0 1 0 0 0
4 1.04 2010-01-04 0 1 0 0 0 0
5 0.526 2010-01-05 0 1 0 0 0 0
6 -0.292 2010-01-06 1 1 0 1 0 0
7 -0.394 2010-01-07 0 1 0 0 0 0
8 -0.547 2010-01-08 0 1 0 0 0 0
9 -0.139 2010-01-09 0 0 1 0 0 0
10 0.628 2010-01-10 0 0 1 0 0 0
I want to run xgb.cv on the first 40 rows and validate it on the final 10 rows.
I try the following:
library(xgboost)
library(dplyr)
X_Val <- ddd %>% select(-c(1:2))
Y_Val <- ddd %>% select(c(1)) %>% pull()
dVal <- xgb.DMatrix(data = as.matrix(X_Val), label = as.numeric(Y_Val))
xgb.cv(data = dVal, nround = 30, folds = NA, params = list(eta = 0.1, max_depth = 5))
which gives me this error:
Error in xgb.cv(data = dVal, nround = 30, folds = NA, eta = 0.1,
max_depth = 5) : 'folds' must be a list with 2 or more elements
that are vectors of indices for each CV-fold
How can I run a simple xgb.cv on the first 40 rows and test it on the last 10 rows.
I eventually want to apply a gird search with a list of parameters and save the results in a list. Since I am dealing with time series data I do not want to mix the folds up, I just want a simple train and in-sample test of 40:10.
Data:
ddd <- structure(list(sanchinarro = c(-1.00742964973274, 0.832453587904369,
1.29242439731365, 1.03688505875294, 0.525806381631517, -0.291919501762755,
-0.394135237187039, -0.547458840323464, -0.138595898626329, 0.628022117055801,
1.19020866188936, 1.5990716035865, 1.5990716035865, -0.70078244345989,
2.11015028070792, 1.95682667757149, 0.985777191040795, 0.883561455616511,
0.985777191040795, 0.270267043070807, 2.51901322240505, 2.41679748698077,
0.372482778495091, -0.291919501762755, -0.905213914308458, -0.905213914308458,
-0.649674575747748, 1.2413165296015, 1.54796373587436, -0.70078244345989,
-0.905213914308458, -0.0363801632020448, 1.54796373587436, 2.00793454528363,
1.54796373587436, -0.445243104899181, -0.445243104899181, 1.03688505875294,
0.628022117055801, -0.496350972611323, 0.168051307646523, -0.649674575747748,
0.0658355722222391, -1.00742964973274, -0.291919501762755, 0.0147277045100972,
0.168051307646523, -0.189703766338471, 0.219159175358665, 0.679129984767943
), date = structure(c(14610, 14611, 14612, 14613, 14614, 14615,
14616, 14617, 14618, 14619, 14620, 14621, 14622, 14623, 14624,
14625, 14626, 14627, 14628, 14629, 14630, 14631, 14632, 14633,
14634, 14635, 14636, 14637, 14638, 14639, 14640, 14641, 14642,
14643, 14644, 14645, 14646, 14647, 14648, 14649, 14650, 14651,
14652, 14653, 14654, 14655, 14656, 14657, 14658, 14659), class = "Date"),
holiday = c(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), weekday = c(1,
0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1), weekend = c(0, 1, 1, 0,
0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
0, 1, 1, 0, 0, 0, 0, 0), workday_on_holiday = c(1, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), weekend_on_holiday = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), protocol_active = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), text_broken_clouds = c(0,
1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1), text_clear = c(0, 0, 0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1), text_fog = c(0, 1, 0, 1, 1, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 1, 0, 1, 0), text_partly_cloudy = c(0, 1, 0, 0, 0,
1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), text_partly_sunny = c(1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1), text_passing_clouds = c(1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1), text_scattered_clouds = c(1, 1,
0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1), text_sunny = c(0, 0, 0, 0,
0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1), month_1 = c(1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), month_2 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1), month_3 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_4 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_5 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), month_6 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), month_7 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), month_8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_9 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_10 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), month_11 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), month_12 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-50L))
EDIT: List data:
The final data comes in the form of lists.
datalst <- list(structure(list(sanchinarro = c(-1.00742964973274, 0.832453587904369,
1.29242439731365, 1.03688505875294, 0.525806381631517, -0.291919501762755,
-0.394135237187039, -0.547458840323464, -0.138595898626329, 0.628022117055801,
1.19020866188936, 1.5990716035865, 1.5990716035865, -0.70078244345989
), date = structure(c(14610, 14611, 14612, 14613, 14614, 14615,
14616, 14617, 14618, 14619, 14620, 14621, 14622, 14623), class = "Date"),
holiday = c(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), weekday = c(1,
0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1), weekend = c(0, 1,
1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0), workday_on_holiday = c(1,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), weekend_on_holiday = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), protocol_active = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), text_broken_clouds = c(0,
1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), text_clear = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0), text_fog = c(0, 1,
0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0), text_partly_cloudy = c(0,
1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0), text_partly_sunny = c(1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1), text_passing_clouds = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), text_scattered_clouds = c(1,
1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1), text_sunny = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0), month_1 = c(1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), month_2 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_3 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_4 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), month_5 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), month_6 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), month_7 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), month_8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), month_9 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), month_10 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), month_11 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), month_12 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-14L)), structure(list(sanchinarro = c(0.832179838392013, 1.29225734336885,
1.03665872949283, 0.525461501740789, -0.292454062662475, -0.394693508212883,
-0.548052676538495, -0.139094894336863, 0.627700947291197, 1.19001789781844,
1.59897568002007, 1.59897568002007, -0.701411844864107, 2.11017290777211
), date = structure(c(14611, 14612, 14613, 14614, 14615, 14616,
14617, 14618, 14619, 14620, 14621, 14622, 14623, 14624), class = "Date"),
holiday = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), weekday = c(0,
0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1), weekend = c(1, 1,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0), workday_on_holiday = c(0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), weekend_on_holiday = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), protocol_active = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), text_broken_clouds = c(1,
0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), text_clear = c(0,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1), text_fog = c(1, 0,
1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0), text_partly_cloudy = c(1,
0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0), text_partly_sunny = c(1,
1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0), text_passing_clouds = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), text_scattered_clouds = c(1,
0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0), text_sunny = c(0,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1), month_1 = c(1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), month_2 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_3 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_4 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), month_5 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), month_6 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), month_7 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), month_8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), month_9 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), month_10 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), month_11 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), month_12 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-14L)), structure(list(sanchinarro = c(1.29293502084952, 1.03729933727253,
0.526027970118536, -0.292006217327851, -0.394260490758649, -0.547641900904846,
-0.138624807181653, 0.628282243549334, 1.19068074741873, 1.59969784114192,
1.59969784114192, -0.701023311051044, 2.11096920829591, 1.95758779814971
), date = structure(c(14612, 14613, 14614, 14615, 14616, 14617,
14618, 14619, 14620, 14621, 14622, 14623, 14624, 14625), class = "Date"),
holiday = c(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), weekday = c(0,
1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0), weekend = c(1, 0,
0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1), workday_on_holiday = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), weekend_on_holiday = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), protocol_active = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), text_broken_clouds = c(0,
1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), text_clear = c(0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0), text_fog = c(0, 1,
1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0), text_partly_cloudy = c(0,
0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0), text_partly_sunny = c(1,
1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1), text_passing_clouds = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), text_scattered_clouds = c(0,
0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0), text_sunny = c(0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0), month_1 = c(1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), month_2 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_3 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_4 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), month_5 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), month_6 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), month_7 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), month_8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), month_9 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), month_10 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), month_11 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), month_12 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-14L)))
EDIT:
I think this gives me what I am after - I need to double/tripple check it. (if you see any errors please let me know)
splt <- 0.80 * nrow(ddd)
ddd[c(1:splt), "id"] = 1
ddd$id[is.na(ddd$id)] = 2
fold.ids <- unique(ddd$id)
custom.folds <- vector("list", length(fold.ids))
i <- 1
for( id in fold.ids){
custom.folds[[i]] <- which( ddd$id %in% id )
i <- i+1
}
custom.folds
cv <- xgb.cv(params = list(eta = 0.1, max_depth = 5), dVal, nround = 10, folds = custom.folds, prediction = TRUE)
cv$evaluation_log
I now need to find a way to apply this to all 3 lists in the "new" added data.
Firstly, you should split the data onto dtrain (40 first rows) and dval (10 last rows). Secondly, you need rather xgb.train, not xgb.cv.
So, your code should be modified to something like that:
library(xgboost)
library(dplyr)
# you code regarding ddd
X <- ddd %>% select(-c(1:2))
Y <- ddd %>% select(c(1)) %>% pull()
dtrain <- xgb.DMatrix(data = as.matrix(X[1:40,]), label = as.numeric(Y[1:40,]))
dval <- xgb.DMatrix(data = as.matrix(X[41:50,]), label = as.numeric(Y[41:50,]))
watchlist <- list(train=dtrain, val=dval)
model <- xgb.train(data=dtrain, watchlist=watchlist, nround = 30, eta = 0.1, max_depth = 5)
IMHO, 40+10 rows only and so sparse features give no hope to obtain good results using XGBoost.

How to use ids from one dataframe to sum rows in another dataframe

I feel like this answer has been asked before, but I can't seem to find an answer to this question. Maybe my title is too vague, so feel free to change it.
So I have one data frame, a, with ids the correspond to column name in data frame b. Both data frames are simplified versions of a much larger data frame.
here is data frame a
a <- structure(list(V1 = structure(c(4L, 5L, 1L, 2L, 3L), .Label = c("GEN[D00105].GT",
"GEN[D00151].GT", "GEN[D00188].GT", "GEN[D86396].GT", "GEN[D86397].GT"
), class = "factor")), row.names = c(NA, -5L), class = "data.frame")
here is data frame b
b <- structure(list(`GEN[D01104].GT` = c(0, 0, 0, 0, 1, 0, 0, 2, 0,
1, 1, 1, 1, 0, 0, 0, 2, 0, 0, 0), `GEN[D01312].GT` = c(1, 0,
2, 2, 0, 0, 0, 0, 0, 1, 1, 0, 0, 2, 0, 0, 2, 0, 0, 0), `GEN[D01878].GT` = c(0,
0, 0, 2, 0, 0, 2, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 2, 0, 0), `GEN[D01882].GT` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0), `GEN[D01952].GT` = c(0,
0, 1, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0), `GEN[D01953].GT` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 2, 0, 0, 0, 2, 0), `GEN[D02053].GT` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0), `GEN[D00316].GT` = c(0,
0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0), `GEN[D01827].GT` = c(0,
0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0), `GEN[D01881].GT` = c(0,
0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0), `GEN[D02044].GT` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0), `GEN[D02085].GT` = c(0,
0, 0, 2, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0), `GEN[D02204].GT` = c(0,
0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0), `GEN[D02276].GT` = c(0,
0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0), `GEN[D02297].GT` = c(0,
0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0), `GEN[D02335].GT` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0, 0), `GEN[D02397].GT` = c(0,
0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0), `GEN[D00856].GT` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0), `GEN[D00426].GT` = c(0,
0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0), `GEN[D02139].GT` = c(0,
0, 1, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0), `GEN[D02168].GT` = c(0,
0, 2, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0)), row.names = c(NA,
-20L), class = "data.frame")
I want to be able to use the ids from data frame a to sum the row in data frame b that have a matching id if that makes sense.
So in the past, I just did something like
b$affected.samples <- (b$`GEN[D86396].GT` + b$`GEN[D86397].GT` + b$`GEN[D00105].GT` + b$`GEN[D00151].GT` + b$`GEN[D00188].GT`)
which got annoying and took to much time, so I moved over to
b$affected.samples <- rowSums(b[,c(1:5)])
Which isn't too bad for this example but with my large data set, my sample can be all over the place, and it's starting to take too much time to finds where everything is. I was hoping there is a way just to use my data frame a to sum the correct rows in data frame b.
Hopefully, I gave this is all the information you need! Let me know if you have any questions.
Thanks in advance!!
Extract the 'V1' column as a character string, use that to select the columns of 'b' (assuming these column names are found in 'b') and get the rowSums
rowSums( b[as.character(a$V1)], na.rm = TRUE)

How to make a regression with many data frames

I have data about gender and petitioning. I want to make a regression between the "Femme" (woman) variable and the different issues of the petitions. I have regrouped those issues into data frames under general themes, and those themes are what I want to regress with the "Femme" (woman) variable.
P.S.: Some petitions have many issues (ex.: water + science). So one petition could be counted in two data frames at the same time.
1) Here is what I did for all issues, this one is an example with the "Aboriginal" issue to show you how I coded the initial issues (you can also see the "Femme" variable at the beginning, which is already coded "0" and "1" in the original dataset under "Female"):
DataPetitions$Femme <- DataPetitions$Female
DataPetitions$Aboriginal <- NA
DataPetitions$Aboriginal[grepl("Aboriginal", DataPetitions$Issue)] <-1
DataPetitions$Aboriginal[!grepl("Aboriginal", DataPetitions$Issue)] <-0
# ... (same for all 24 specific issues)
2) Creating 7 data frames for general petitioning themes:
EnvironmentalIssues <- c(DataPetitions$AirQuality,DataPetitions$Biological, DataPetitions$Climate, DataPetitions$Environmental, DataPetitions$Toxic, DataPetitions$Waste, DataPetitions$Water)
EconomicIssues <- c(DataPetitions$Natural, DataPetitions$Transport)
SocialIssues <- c(DataPetitions$Aboriginal, DataPetitions$Health)
AgriculturalIssues <- c(DataPetitions$Agriculture,
DataPetitions$Fisheries, DataPetitions$Pesticides)
PoliticalIssues <- c(DataPetitions$Compliance, DataPetitions$Federal,
DataPetitions$Governance, DataPetitions$International)
ScientificIssues <- c(DataPetitions$Science)
OtherIssues <- c(DataPetitions$Other)
3) Trying to do a regression. This is my glm code:
model7 <- glm(DataPetitions$Femme ~ SocialIssues + PoliticalIssues +
ScientificIssues + EnvironmentalIssues + EconomicIssues +
AgriculturalIssues + OtherIssues, data = DataPetitions)
# When I try to run it, I get this error message:
Error in model.frame.default(formula = DataPetitions$Femme ~
SocialIssues + : variable lengths differ (found for
'SocialIssues')
With dput(head(DataPetitions,20)), I get this:
[...] class = "factor"), Femme = c(1, 1, 1, 0, 0, 0, 0, 0, 0,
0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1), AuMoinsUneFemme = c(1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1),
Homme = c(1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,
1, 1, 1, 2), AuMoinsUnHomme = c(1, 0, 0, 0, 1, 1, 0, 1, 1,
1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1), Individual1 = c(0, 0, 0,
1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0), Group1 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1),
Organisation1 = c(1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
0, 0, 1, 0, 0, 0, 0), Aboriginal = c(1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0), Agriculture = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),
AirQuality = c(0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 0), Biological = c(0, 1, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), Climate = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1), Compliance = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0),
Environmental = c(0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), Federal = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Fisheries = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0), Governance = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Health = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), International = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), Natural = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0), Other = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Pesticides = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), Science = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Toxic = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Transport = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Waste = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), Water = c(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("Data.", "Title", "Number", "Issue", "Petitioner", "Individual", "Group", "Organisation",
"Female", "Male", "Unknown", "DateReceived", "Status", "Summary",
"Hyperlink", "Femme", "AuMoinsUneFemme", "Homme", "AuMoinsUnHomme",
"Individual1", "Group1", "Organisation1", "Aboriginal", "Agriculture",
"AirQuality", "Biological", "Climate", "Compliance", "Environmental",
"Federal", "Fisheries", "Governance", "Health", "International",
"Natural", "Other", "Pesticides", "Science", "Toxic", "Transport",
"Waste", "Water"), row.names = c(NA, 20L), class = "data.frame")

Resources