How to make a regression with many data frames - r

I have data about gender and petitioning. I want to make a regression between the "Femme" (woman) variable and the different issues of the petitions. I have regrouped those issues into data frames under general themes, and those themes are what I want to regress with the "Femme" (woman) variable.
P.S.: Some petitions have many issues (ex.: water + science). So one petition could be counted in two data frames at the same time.
1) Here is what I did for all issues, this one is an example with the "Aboriginal" issue to show you how I coded the initial issues (you can also see the "Femme" variable at the beginning, which is already coded "0" and "1" in the original dataset under "Female"):
DataPetitions$Femme <- DataPetitions$Female
DataPetitions$Aboriginal <- NA
DataPetitions$Aboriginal[grepl("Aboriginal", DataPetitions$Issue)] <-1
DataPetitions$Aboriginal[!grepl("Aboriginal", DataPetitions$Issue)] <-0
# ... (same for all 24 specific issues)
2) Creating 7 data frames for general petitioning themes:
EnvironmentalIssues <- c(DataPetitions$AirQuality,DataPetitions$Biological, DataPetitions$Climate, DataPetitions$Environmental, DataPetitions$Toxic, DataPetitions$Waste, DataPetitions$Water)
EconomicIssues <- c(DataPetitions$Natural, DataPetitions$Transport)
SocialIssues <- c(DataPetitions$Aboriginal, DataPetitions$Health)
AgriculturalIssues <- c(DataPetitions$Agriculture,
DataPetitions$Fisheries, DataPetitions$Pesticides)
PoliticalIssues <- c(DataPetitions$Compliance, DataPetitions$Federal,
DataPetitions$Governance, DataPetitions$International)
ScientificIssues <- c(DataPetitions$Science)
OtherIssues <- c(DataPetitions$Other)
3) Trying to do a regression. This is my glm code:
model7 <- glm(DataPetitions$Femme ~ SocialIssues + PoliticalIssues +
ScientificIssues + EnvironmentalIssues + EconomicIssues +
AgriculturalIssues + OtherIssues, data = DataPetitions)
# When I try to run it, I get this error message:
Error in model.frame.default(formula = DataPetitions$Femme ~
SocialIssues + : variable lengths differ (found for
'SocialIssues')
With dput(head(DataPetitions,20)), I get this:
[...] class = "factor"), Femme = c(1, 1, 1, 0, 0, 0, 0, 0, 0,
0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1), AuMoinsUneFemme = c(1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1),
Homme = c(1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,
1, 1, 1, 2), AuMoinsUnHomme = c(1, 0, 0, 0, 1, 1, 0, 1, 1,
1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1), Individual1 = c(0, 0, 0,
1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0), Group1 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1),
Organisation1 = c(1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0,
0, 0, 1, 0, 0, 0, 0), Aboriginal = c(1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0), Agriculture = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),
AirQuality = c(0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
0, 0, 1, 0, 0, 0), Biological = c(0, 1, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), Climate = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1), Compliance = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0),
Environmental = c(0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), Federal = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Fisheries = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0), Governance = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Health = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), International = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), Natural = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0), Other = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Pesticides = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), Science = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Toxic = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Transport = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Waste = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), Water = c(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("Data.", "Title", "Number", "Issue", "Petitioner", "Individual", "Group", "Organisation",
"Female", "Male", "Unknown", "DateReceived", "Status", "Summary",
"Hyperlink", "Femme", "AuMoinsUneFemme", "Homme", "AuMoinsUnHomme",
"Individual1", "Group1", "Organisation1", "Aboriginal", "Agriculture",
"AirQuality", "Biological", "Climate", "Compliance", "Environmental",
"Federal", "Fisheries", "Governance", "Health", "International",
"Natural", "Other", "Pesticides", "Science", "Toxic", "Transport",
"Waste", "Water"), row.names = c(NA, 20L), class = "data.frame")

Related

Errors with distance-decay using betapart and ddecay packages

My goal is to create a distance-decay curve for species data vs geographic distance. However, I am running into errors. For the betapart package, this may be due to the lack of columns relative to the number of rows. Is there a way to get past this? If not, is there another method for creating a distance-decay curve (and plotting it)? I also tried the ddecay package but ran into errors there too. Any help is much appreciated. Data is in structure form below.
# BETAPART -------------------------------------------------
library(betapart)
spat.dist<-dist(coords)
dissim.BCI<-beta.pair.abund(spec)$beta.bray.bal
plot(spat.dist, dissim.BCI, ylim=c(0,1), xlim=c(0, max(spat.dist)))
BCI.decay.exp<-decay.model(dissim.BCI, spat.dist, y.type="dissim", model.type="exp", perm=100)
#========================================================================================================
I also tried a few other packages --------------------------
# ddecay package -------------------------------------------
devtools::install_github("chihlinwei/ddecay")
the issue with this method is that it requires the use of a gradient however, I would like to avoid that if possible but I do not see a way around this. Also they do not include their example data in the package.
dd <- beta.decay(gradient=spat.dist, counts=decostand(spec, method="pa"),
coords=coords, nboots=1000,
dis.fun = "beta.pair", index.family = "sorensen", dis = 1, like.pairs=T)
x <- vegdist(coords, method = "euclidean")
y <- 1 - dist(decostand(spec, method="pa"), index.family = "sorensen")[[1]]
plot(x, y)
lines(dd$Predictions[, "x"], dd$Predictions[,"mean"], col="red", lwd=2)
#========================================================================================================
# DATA -----------------------------------------------------
spec <- structure(list(Ccol = c(0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Acol = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), NYcol = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), Mcol = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), AAcol = c(14, 0, 14, 3, 11, 1, 0, 2, 0,
3, 0, 4, 0, 1, 8, 2, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 7),
Ncol = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1), ATBcol = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 3), CVcol = c(0, 0, 0, 0, 0, 0, 1, 20,
0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 2, 0, 0,
0, 6), AZNcol = c(0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), GBcol = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), KHAcol = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0), AFcol = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0), AFPcol = c(0,
0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1), TIAcol = c(4, 1, 0, 2, 6, 0,
1, 1, 0, 2, 0, 0, 0, 1, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0), AUcol = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), AScol = c(0,
4, 0, 2, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 5, 0, 0), NSAcol = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 7, 0, 0, 3, 0, 0, 0, 4, 0, 2, 0, 1, 0, 9, 5, 1,
0, 0, 2, 0), WZcol = c(0, 0, 0, 0, 0, 0, 1, 0, 0, 10, 4,
0, 0, 0, 0, 0, 0, 1, 5, 0, 0, 0, 17, 4, 0, 0, 0, 0, 0), AJcol = c(0,
3, 6, 0, 0, 1, 0, 4, 0, 0, 0, 0, 39, 12, 0, 0, 0, 0, 0, 0,
0, 4, 5, 1, 12, 13, 16, 0, 5), EADcol = c(4, 1, 2, 1, 2,
0, 0, 0, 0, 4, 0, 2, 1, 1, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0,
0, 0, 0, 0, 1), CAcol = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), Pcol = c(0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 60, 0, 0,
13, 0, 8, 1, 0, 0, 0, 0, 0), ASDcol = c(3, 5, 6, 17, 3, 5,
26, 2, 0, 17, 3, 10, 6, 3, 2, 4, 0, 0, 5, 25, 0, 0, 0, 2,
2, 9, 0, 2, 8), RMAcol = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
OUcol = c(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), KAcol = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12,
0, 0, 0, 0, 0, 8, 1), PACcol = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 2, 0, 37, 0, 24,
1, 0, 0), LAAcol = c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), GAcol = c(1,
0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0,
0, 0, 3, 0, 0, 0, 2, 0, 0), AAcol = c(1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0), EVAcol = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0), EAcol = c(0,
0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), AKcol = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0), Acol = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0), QAcol = c(0,
0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), YAcol = c(11, 24, 21, 63, 44,
95, 12, 43, 0, 5, 26, 22, 25, 48, 86, 2, 0, 0, 13, 0, 0,
2, 0, 0, 60, 6, 7, 0, 45), BANcol = c(0, 0, 0, 3, 0, 0, 0,
0, 0, 0, 0, 0, 24, 0, 6, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0,
9, 17, 17), VCcol = c(0, 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Vcol = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0), Ocol = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), AVcol = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), JXcol = c(0,
3, 3, 0, 0, 0, 0, 0, 8, 0, 0, 10, 3, 0, 0, 5, 0, 0, 0, 1,
0, 0, 0, 2, 4, 1, 0, 0, 0)), class = "data.frame", row.names = c(NA,
-29L))
coords <- structure(list(Lat.x = c(34.43363, 34.36784, 34.32587, 34.19891,
34.24217, 34.24863, 34.18137, 34.16838, 34.10961, 34.08329, 34.40571,
34.39591, 34.39292, 34.37466, 34.28948, 34.26146, 34.04687, 34.0409,
34.068339, 34.34679, 34.17161, 34.23308, 34.21544, 34.14922,
34.27539, 34.2323, 34.19057, 34.07042, 34.06289), Lon.x = c(-94.94494,
-94.92512, -94.94429, -94.84497, -94.8573, -94.85641, -94.887,
-94.91322, -94.92913, -94.93276, -95.02622, -95.04382, -94.96295,
-94.83733, -94.81071, -94.79161, -95.03968, -95.0608, -95.086986,
-95.03345, -95.23862, -95.25619, -95.1041, -95.02286, -95.02672,
-95.02626, -95.02941, -95.01746, -94.98786)), class = "data.frame", row.names = c(NA,
-29L))
You can get more answers, if you tell what was the problem. For instance, which functions failed and what was the error message. I had a look at betapart::decay.model(), where I could get this error message:
Error in eval(family$initialize) :
cannot find valid starting values: please specify some
I cut the long story short: you cannot use this function with your data because you have dissimilarities of 1 in your data, dissimilarities are turned into similarities with 1-dissimilarity and this makes these values zero similarities (that is, these pairs of sampling unit have nothing in common, they share no species). Function decay.model uses glm with gaussian family with log-link, and log-link requires that you give the starting values, if you have zeros in the y-variate.
I think that you have four alternatives:
You do not use the method as it does not suit your data.
You modify the decay.model function so that you can specify the starting values, like the error message suggested. This means that you add mustart to the function call so that it reads, e.g., glm(y ~ x, family=gaussian(link="log"), mustart=pmax(y, 0.01)). This replaces zeros with 0.01 as starting values.
You change maximum distances from 1 to something smaller, for instance, 0.99: dissim.BCI[dissim.BCI==1] <- 0.99. However, this changes the data, and also changes the results from those you get with alternative 2 (which only changes starting values, but data are unmodified). However, the effect is not very large and any Bayesian would claim that dissimilarity 1 is just a frequentist folly (you just haven't seen the case that is in common with these sampling units).
You change the maximum distance to missing values. This will change data more than alternative 3. It removes maximum dissimilarities and these no longer influence the decay curve. The effect is the same as censoring greatest dissimilarities. The results change more than in alternative 3.

Problems with merging two files with yearly binary data for two overlapping subsets of individuals

I work with mark-recaptures of animals, and I have two capture histories I need to merge. Both files look like this:
Both files include subsets of the same group of animals, however, all inividuals are not present in both files. Also, one file contains more YEARS (in columns) than the other. The 0's and 1's indicate whether the animal was observed this year or not.
I need to merge both files, ending up with a file that contains all individuals that are included in these files. Observation data need to be merged for those individuals that are present in both files. If observation status for a given animal is 0 in FILE1 and 0 in FILE2, the observation status in the merged file need to be 0, if 0 in FILE1 and 1 in FILE2, observation status in the merged file should be 1, and if 1 in both files, it still needs to be 1 in the merged file (NOT 2).
Below you'll find samples of both files, FILE1 and FILE2. Any help appreciated.
FILE1:
> dput(FILE1)
structure(list(ID = c("1", "LL-30", "M-300", "NKW-001", "NKW-002",
"NKW-003", "NKW-004", "NKW-006", "NKW-007", "NKW-009", "NKW-010",
"NKW-011", "NKW-012", "NKW-013", "NKW-014", "NKW-015", "NKW-016",
"NKW-017", "NKW-018", "NKW-019", "NKW-021", "NKW-022", "NKW-023",
"NKW-024", "NKW-025", "NKW-026", "NKW-028", "NKW-029", "NKW-030",
"NKW-031", "NKW-032", "NKW-033", "NKW-034", "NKW-035", "NKW-036",
"NKW-037", "NKW-038", "NKW-039", "NKW-040"), `1986` = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1987` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1988` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1989` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1990` = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1991` = c(0,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1992` = c(0,
0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1993` = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1994` = c(0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1995` = c(1,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1996` = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1997` = c(0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1998` = c(1,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1999` = c(1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2000` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2001` = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2002` = c(1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2003` = c(1,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2004` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2005` = c(1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0), `2006` = c(0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2007` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2008` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2012` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2013` = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), `2014` = c(0,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0), `2015` = c(0,
0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1), `2016` = c(0,
0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0), `2017` = c(0,
0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1), `2018` = c(0,
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1), `2019` = c(0,
0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1)), class = "data.frame", row.names = c(NA,
-39L))
FILE2:
> dput(FILE2)
structure(list(ID = c("KI03", "KI05", "KI06", "KI07", "KI08",
"KI10", "NKW-001", "NKW-004", "NKW-005", "NKW-009", "NKW-019",
"NKW-023", "NKW-025", "NKW-027", "NKW-031", "NKW-032", "NKW-040",
"NKW-045", "NKW-424", "NKW-431", "NKW-441", "NKW-443"), `2008` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0
), `2009` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), `2010` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2011` = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), `2012` = c(0,
0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
), `2013` = c(1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0), `2014` = c(0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2015` = c(1, 1, 1, 0, 1, 1,
1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0), `2016` = c(1,
0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0
), `2017` = c(1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 1, 0, 0, 0), `2018` = c(1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2019` = c(0, 0, 0, 1, 1, 1,
0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0), `2020` = c(0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
)), class = "data.frame", row.names = c(NA, -22L))
Here is a scalable data.table solution with no merging involved.
If you have got more files, just add them to the list L
library( data.table )
setDT(df1);setDT(df2) #set to data.table format
L <- list( df1, df2 ) #put the data.tables in a list
#melt all data.tables in the list to long format
L.melt <- lapply( L, melt, id.vars = "ID", variable.name = "year", variable.factor = FALSE )
#rowbind to one large data.table
DT <- data.table::rbindlist( L.melt, use.names = TRUE, fill = TRUE )
#summarise, output a logical TRUE (=1) of FALSE = 0 based on the sum of 0's and 1's
ans <- DT[, .( seen = as.numeric( sum(value) > 0 ) ), by = .(ID, year) ]
#cast to wide again, fill in missing observations in years with 0
dcast( ans, ID ~ year, value.var = "seen", fill = 0 )

Frequency bar graph in descending order

I created a frequency graph using ggplot2. I would like the bars to go in descending order based on frequency counts. So language measures from left to right, BNT, WAB_R, BDAE...etc. Of note, my dataframe is organized with the language measures are columns and the cases are rows. The values are 0 or 1 and 1 means that the participant endorsed the language measure. I have tried using reorder in various combinations but had no luck. I appreciate the help!
Here is sample data:
WAB-R BDAE BNT CAT
1 0 0 1 0
2 1 0 1 1
3 0 0 0 0
4 1 1 0 0
5 0 0 0 1
6 0 1 1 0
7 1 0 0 0
8 0 1 1 0
Portion of the Data Show in New WindowClear
OutputExpand/Collapse Output
structure(list(WAB_R = c(0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0), WAB_B = c(1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0), BDAE = c(0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), CAT = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0), BNT = c(0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1), PNT = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), PyramidPalms = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), QAB = c(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0), PALPA = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0), BASA = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Compiled_lang = c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA, -85L), class = c("tbl_df", "tbl", "data.frame"))
Code:
library(tidyverse) survey %>% select(c(WAB_R:other_lang_measure)) %>% pivot_longer(everything()) %>% filter(value==1) %>% ggplot(aes(x= value))+ geom_histogram(stat = 'count',aes(fill=name), position = position_dodge2(0.9,preserve = 'single'))+ labs(fill='Language Measures') + theme(axis.title.x=element_blank(), axis.text.x=element_blank(),axis.ticks.x=element_blank()) + scale_y_continuous(breaks=seq(0,50,5))+ ylab("Frequency Counts") + coord_cartesian(ylim=c(0, 45))+ ggtitle("\nLanguage Measures\n ")+ cleanup
As mentioned by sage #r2evans you would need to format as factor the x-axis variable. Also, you can compute the counts directly using summarise() and then arrange in order to sketch the plot:
library(tidyverse)
#Code
survey %>%
select(c(WAB_R:Compiled_lang)) %>%
pivot_longer(everything()) %>% filter(value==1) %>%
group_by(name) %>%
summarise(value=sum(value)) %>%
arrange(desc(value)) %>%
mutate(name=factor(name,levels = unique(name),ordered = T)) %>%
ggplot(aes(x= name,y=value))+
geom_bar(stat = 'identity',aes(fill=name),
position = position_dodge2(0.9,preserve = 'single'))+
labs(fill='Language Measures') +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),axis.ticks.x=element_blank()) +
scale_y_continuous(breaks=seq(0,50,5))+
ylab("Frequency Counts") +
coord_cartesian(ylim=c(0, 45))+ ggtitle("\nLanguage Measures\n ")
Output:
Here is a way:
survey <- structure(list(WAB_R = c(0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0),
WAB_B = c(1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0),
BDAE = c(0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1),
CAT = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0),
BNT = c(0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1),
PNT = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
PyramidPalms = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
QAB = c(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0),
PALPA = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0),
BASA = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Compiled_lang = c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0)),
row.names = c(NA, -85L), class = c("tbl_df", "tbl", "data.frame"))
library(tidyverse,warn.conflicts = F)
survey %>%
pivot_longer(everything()) %>%
filter(value==1) %>%
count(name) %>%
ggplot(aes(x= name, y = n)) +
geom_col() +
labs(y = "Frequency Counts", title = "Language Measures", x = "")
survey %>%
pivot_longer(everything()) %>%
filter(value==1) %>%
count(name) %>%
ggplot(aes(x= name, y = n)) +
geom_col() +
labs(y = "Frequency Counts", title = "Language Measures", x = "") +
coord_flip()
Created on 2021-01-15 by the reprex package (v0.3.0)

xgb.cv with no folds and return the results based on a split of the data

I have some data which looks like:
# A tibble: 50 x 28
sanchinarro date holiday weekday weekend workday_on_holi… weekend_on_holi… protocol_active
<dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 -1.01 2010-01-01 1 1 0 1 0 0
2 0.832 2010-01-02 0 0 1 0 0 0
3 1.29 2010-01-03 0 0 1 0 0 0
4 1.04 2010-01-04 0 1 0 0 0 0
5 0.526 2010-01-05 0 1 0 0 0 0
6 -0.292 2010-01-06 1 1 0 1 0 0
7 -0.394 2010-01-07 0 1 0 0 0 0
8 -0.547 2010-01-08 0 1 0 0 0 0
9 -0.139 2010-01-09 0 0 1 0 0 0
10 0.628 2010-01-10 0 0 1 0 0 0
I want to run xgb.cv on the first 40 rows and validate it on the final 10 rows.
I try the following:
library(xgboost)
library(dplyr)
X_Val <- ddd %>% select(-c(1:2))
Y_Val <- ddd %>% select(c(1)) %>% pull()
dVal <- xgb.DMatrix(data = as.matrix(X_Val), label = as.numeric(Y_Val))
xgb.cv(data = dVal, nround = 30, folds = NA, params = list(eta = 0.1, max_depth = 5))
which gives me this error:
Error in xgb.cv(data = dVal, nround = 30, folds = NA, eta = 0.1,
max_depth = 5) : 'folds' must be a list with 2 or more elements
that are vectors of indices for each CV-fold
How can I run a simple xgb.cv on the first 40 rows and test it on the last 10 rows.
I eventually want to apply a gird search with a list of parameters and save the results in a list. Since I am dealing with time series data I do not want to mix the folds up, I just want a simple train and in-sample test of 40:10.
Data:
ddd <- structure(list(sanchinarro = c(-1.00742964973274, 0.832453587904369,
1.29242439731365, 1.03688505875294, 0.525806381631517, -0.291919501762755,
-0.394135237187039, -0.547458840323464, -0.138595898626329, 0.628022117055801,
1.19020866188936, 1.5990716035865, 1.5990716035865, -0.70078244345989,
2.11015028070792, 1.95682667757149, 0.985777191040795, 0.883561455616511,
0.985777191040795, 0.270267043070807, 2.51901322240505, 2.41679748698077,
0.372482778495091, -0.291919501762755, -0.905213914308458, -0.905213914308458,
-0.649674575747748, 1.2413165296015, 1.54796373587436, -0.70078244345989,
-0.905213914308458, -0.0363801632020448, 1.54796373587436, 2.00793454528363,
1.54796373587436, -0.445243104899181, -0.445243104899181, 1.03688505875294,
0.628022117055801, -0.496350972611323, 0.168051307646523, -0.649674575747748,
0.0658355722222391, -1.00742964973274, -0.291919501762755, 0.0147277045100972,
0.168051307646523, -0.189703766338471, 0.219159175358665, 0.679129984767943
), date = structure(c(14610, 14611, 14612, 14613, 14614, 14615,
14616, 14617, 14618, 14619, 14620, 14621, 14622, 14623, 14624,
14625, 14626, 14627, 14628, 14629, 14630, 14631, 14632, 14633,
14634, 14635, 14636, 14637, 14638, 14639, 14640, 14641, 14642,
14643, 14644, 14645, 14646, 14647, 14648, 14649, 14650, 14651,
14652, 14653, 14654, 14655, 14656, 14657, 14658, 14659), class = "Date"),
holiday = c(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), weekday = c(1,
0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1), weekend = c(0, 1, 1, 0,
0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
0, 1, 1, 0, 0, 0, 0, 0), workday_on_holiday = c(1, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), weekend_on_holiday = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), protocol_active = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), text_broken_clouds = c(0,
1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1), text_clear = c(0, 0, 0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1), text_fog = c(0, 1, 0, 1, 1, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 1, 0, 1, 0), text_partly_cloudy = c(0, 1, 0, 0, 0,
1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), text_partly_sunny = c(1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1), text_passing_clouds = c(1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 1, 1, 1), text_scattered_clouds = c(1, 1,
0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1), text_sunny = c(0, 0, 0, 0,
0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1), month_1 = c(1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), month_2 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1), month_3 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_4 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_5 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), month_6 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), month_7 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), month_8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_9 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_10 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), month_11 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), month_12 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-50L))
EDIT: List data:
The final data comes in the form of lists.
datalst <- list(structure(list(sanchinarro = c(-1.00742964973274, 0.832453587904369,
1.29242439731365, 1.03688505875294, 0.525806381631517, -0.291919501762755,
-0.394135237187039, -0.547458840323464, -0.138595898626329, 0.628022117055801,
1.19020866188936, 1.5990716035865, 1.5990716035865, -0.70078244345989
), date = structure(c(14610, 14611, 14612, 14613, 14614, 14615,
14616, 14617, 14618, 14619, 14620, 14621, 14622, 14623), class = "Date"),
holiday = c(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), weekday = c(1,
0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1), weekend = c(0, 1,
1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0), workday_on_holiday = c(1,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0), weekend_on_holiday = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), protocol_active = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), text_broken_clouds = c(0,
1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), text_clear = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0), text_fog = c(0, 1,
0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0), text_partly_cloudy = c(0,
1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0), text_partly_sunny = c(1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1), text_passing_clouds = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), text_scattered_clouds = c(1,
1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1), text_sunny = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0), month_1 = c(1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), month_2 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_3 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_4 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), month_5 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), month_6 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), month_7 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), month_8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), month_9 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), month_10 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), month_11 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), month_12 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-14L)), structure(list(sanchinarro = c(0.832179838392013, 1.29225734336885,
1.03665872949283, 0.525461501740789, -0.292454062662475, -0.394693508212883,
-0.548052676538495, -0.139094894336863, 0.627700947291197, 1.19001789781844,
1.59897568002007, 1.59897568002007, -0.701411844864107, 2.11017290777211
), date = structure(c(14611, 14612, 14613, 14614, 14615, 14616,
14617, 14618, 14619, 14620, 14621, 14622, 14623, 14624), class = "Date"),
holiday = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), weekday = c(0,
0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1), weekend = c(1, 1,
0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0), workday_on_holiday = c(0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), weekend_on_holiday = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), protocol_active = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), text_broken_clouds = c(1,
0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0), text_clear = c(0,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1), text_fog = c(1, 0,
1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0), text_partly_cloudy = c(1,
0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0), text_partly_sunny = c(1,
1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0), text_passing_clouds = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), text_scattered_clouds = c(1,
0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0), text_sunny = c(0,
0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1), month_1 = c(1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), month_2 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_3 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_4 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), month_5 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), month_6 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), month_7 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), month_8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), month_9 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), month_10 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), month_11 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), month_12 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-14L)), structure(list(sanchinarro = c(1.29293502084952, 1.03729933727253,
0.526027970118536, -0.292006217327851, -0.394260490758649, -0.547641900904846,
-0.138624807181653, 0.628282243549334, 1.19068074741873, 1.59969784114192,
1.59969784114192, -0.701023311051044, 2.11096920829591, 1.95758779814971
), date = structure(c(14612, 14613, 14614, 14615, 14616, 14617,
14618, 14619, 14620, 14621, 14622, 14623, 14624, 14625), class = "Date"),
holiday = c(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), weekday = c(0,
1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0), weekend = c(1, 0,
0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1), workday_on_holiday = c(0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), weekend_on_holiday = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), protocol_active = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), text_broken_clouds = c(0,
1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1), text_clear = c(0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0), text_fog = c(0, 1,
1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0), text_partly_cloudy = c(0,
0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0), text_partly_sunny = c(1,
1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1), text_passing_clouds = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), text_scattered_clouds = c(0,
0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0), text_sunny = c(0,
0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0), month_1 = c(1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), month_2 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_3 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), month_4 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), month_5 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), month_6 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0), month_7 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), month_8 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), month_9 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), month_10 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), month_11 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), month_12 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-14L)))
EDIT:
I think this gives me what I am after - I need to double/tripple check it. (if you see any errors please let me know)
splt <- 0.80 * nrow(ddd)
ddd[c(1:splt), "id"] = 1
ddd$id[is.na(ddd$id)] = 2
fold.ids <- unique(ddd$id)
custom.folds <- vector("list", length(fold.ids))
i <- 1
for( id in fold.ids){
custom.folds[[i]] <- which( ddd$id %in% id )
i <- i+1
}
custom.folds
cv <- xgb.cv(params = list(eta = 0.1, max_depth = 5), dVal, nround = 10, folds = custom.folds, prediction = TRUE)
cv$evaluation_log
I now need to find a way to apply this to all 3 lists in the "new" added data.
Firstly, you should split the data onto dtrain (40 first rows) and dval (10 last rows). Secondly, you need rather xgb.train, not xgb.cv.
So, your code should be modified to something like that:
library(xgboost)
library(dplyr)
# you code regarding ddd
X <- ddd %>% select(-c(1:2))
Y <- ddd %>% select(c(1)) %>% pull()
dtrain <- xgb.DMatrix(data = as.matrix(X[1:40,]), label = as.numeric(Y[1:40,]))
dval <- xgb.DMatrix(data = as.matrix(X[41:50,]), label = as.numeric(Y[41:50,]))
watchlist <- list(train=dtrain, val=dval)
model <- xgb.train(data=dtrain, watchlist=watchlist, nround = 30, eta = 0.1, max_depth = 5)
IMHO, 40+10 rows only and so sparse features give no hope to obtain good results using XGBoost.

Drop first element of list of lists, condense list of lists? Too many elements?

Hello wonderful community!
I am having trouble working with a list of lists, each containing 4 DF. I only provide two lists here but I have 54. What I want to do: (1) I want to summarize the amount of rows in each "Comm". (2) I want to drop "comm". (3) I want to combine all of the other elements in each list, into one data frame.
Data below
I have tried all these approaches as well as others.
R list of lists to data.frame
R - list to data frame
Export each data frame within a list to csv
Any ideas on how to do this? Thanks!
> dput(c(output$`2000-Fall`,output$`2001-Spring`))
list(Comm = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), .Dim = c(37L,
27L), .Dimnames = list(NULL, c("AS", "LNS", "SC", "BND", "RT",
"LND", "EBT", "BM", "CS", "BT", "BB", "WS", "GS", "CP", "YB",
"F", "P", "LMB", "YP", "RP", "B", "TD", "AE", "CCS", "A", "FSS",
"M"))), Coherence = structure(list(name = structure(c(1L, 6L,
3L, 4L, 5L, 2L), .Label = c("embAbs", "method = r1", "p", "simMean",
"simVariance", "z"), class = "factor"), stat = c(200, 11.3001899623484,
1.30934461418281e-29, 604.17, 35.7666553701019, NA)), class = "data.frame", row.names = c(NA,
-6L)), Turnover = structure(list(name = structure(c(5L, 6L, 2L,
3L, 4L, 1L), .Label = c("method = EMS", "p", "simMean", "simVariance",
"turnover", "z"), class = "factor"), stat = c(10893, -6.01765438449338,
1.7696251436801e-09, 5823.65, 842.412952971008, NA)), class = "data.frame", row.names = c(NA,
-6L)), Boundary = structure(list(name = structure(c(2L, 3L, 1L
), .Label = c("df", "index", "p"), class = "factor"), stat = c(2.17647058823529,
8.59400834470891e-05, 34)), class = "data.frame", row.names = c(NA,
-3L)), Comm = structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), .Dim = 33:32, .Dimnames = list(
NULL, c("TT", "BM", "LC", "RT", "SC", "CS", "AS", "CRC",
"LND", "BND", "EBT", "BT", "TD", "BL", "F", "SMB", "WS",
"RP", "P", "GSF", "YP", "AE", "CP", "LMB", "YB", "BS", "BB",
"GS", "B", "FSS", "K", "M"))), Coherence = structure(list(
name = structure(c(1L, 6L, 3L, 4L, 5L, 2L), .Label = c("embAbs",
"method = r1", "p", "simMean", "simVariance", "z"), class = "factor"),
stat = c(261, 8.8243058044828, 1.10140877775716e-18, 611.43,
39.711905702767, NA)), class = "data.frame", row.names = c(NA,
-6L)), Turnover = structure(list(name = structure(c(5L, 6L, 2L,
3L, 4L, 1L), .Label = c("method = EMS", "p", "simMean", "simVariance",
"turnover", "z"), class = "factor"), stat = c(10074, -4.74001047198576,
2.13707173521215e-06, 5701.71, 922.422012744687, NA)), class = "data.frame", row.names = c(NA,
-6L)), Boundary = structure(list(name = structure(c(2L, 3L, 1L
), .Label = c("df", "index", "p"), class = "factor"), stat = c(1.82705314009662,
0.000113336323694813, 30)), class = "data.frame", row.names = c(NA,
-3L)))
The object shown in the question is not a list of lists of 4 data frames each so if we call that object L then to get what is described we need to create LL like this. Now using LL we can compute the 3 items requested.
LL <- list(L[1:4], L[5:8])
res1 <- sapply(LL, function(x) nrow(x$Comm))
res2 <- lapply(LL, function(x) x[names(x) != "Comm"])
res3 <- lapply(LL, function(x) do.call("rbind", x[names(x) != "Comm"]))

Resources