invalid number of intervals with partykit decision trees - r

I'm trying to replicate the procedure proposed here on my data but I get the following error:
Error in interval.numeric(x, breaks = c(xmin - tol, ux, xmax)) :
invalid number of intervals
target is the categorical variable that I want to predict while I would force the first split of the classification tree to be done according to split.variable (categorical too). Due to the object characteristics, indeed, if split.variable is 1 target can be only 1, while if it is 0, target can be or 0 or 1.
Initially I treated them as factors but I changed them to numeric and then rounded (as suggested in other posts in SO). Unfortunately, none of these solutions were helpful.
I played a bit with the data, subsampling cols and rows but still it doesn't work.
What am I missing?
Here is an MRE to replicate the error:
library(partykit)
tdf = structure(list(target = c(0, 0, 0, 1, 0, 0, 1, 1, 1, 1), split.variable = c(0,
0, 0, 0, 1, 0, 0, 0, 0, 0), var1 = c(2.021, 1.882, 1.633, 3.917,
2.134, 1.496, 1.048, 1.552, 1.65, 3.112), var2 = c(97.979, 98.118,
98.367, 96.083, 97.866, 98.504, 98.952, 98.448, 98.35, 96.888
), var3 = c(1, 1, 1, 0.98, 1, 1, 1, 1, 1, 1), var4 = c(1, 1,
1, 0.98, 1, 1, 1, 1, 1, 1), var5 = c(18.028, 25.207, 20.788,
28.548, 18.854, 19.984, 27.352, 24.622, 25.037, 24.067), var6 = c(0.213,
0.244, 0.289, 0.26, 0.887, 0.575, 0.097, 0.054, 0.104, 0.096),
var7 = c(63.22, 59.845, 62.45, 63.48, 52.143, 51.256, 56.296,
57.494, 59.543, 68.434), var8 = c(0.748, 0.795, 0.807, 0.793,
0.901, 0.909, 0.611, 0.61, 0.618, 0.589)), row.names = c(6L,
7L, 8L, 9L, 11L, 12L, 15L, 16L, 17L, 18L), class = "data.frame")
tr1 <- ctree(target ~ split.variable, data = tdf, maxdepth = 1)
tr2 <- ctree(target ~ split.variable + ., data = tdf, subset = predict(tr1, type = "node") == 2)

Your data set is too small to do what you want:
With just 10 observations tr1 does not lead to any splits but produces a tree with a single root node.
Consequently, predict(tr1, type = "node") produces a vector of 10 times 1.
Thus, the subset with predict(tr1, type = "node") == 2 is empty (all FALSE).
This leads to an (admittedly cryptic) error message, reflecting that you cannot learn a tree from an empty data set.
Additionally: I'm not sure where you found the recommendation to use numeric codings of categorical variables. But for partykit you are almost always better off coding categorical variables appropriately as factor variables.

Related

R: How do I sort a dataframe based on a numeric vector?

I want to calculate the fold change between thyroid and testes dataframe using TPM values and provide the top 10 genes overexpressed in testes tissue (testes$gene_id in the testes dataframe).
In my code below, I first calculated the fold change and store it as a numeric vector tpm.foldchange but then I don't know how to sort the gene_id column of the testes dataframe based on the sorted fold-change values tpm.foldchange.
# Parse the gene results file from the testes and thyroid output
thyroid <- read.table("thyroid.genes.results", header=T, sep="\t")
testes <- read.table("testes.genes.results", header=T, sep="\t")
# Extract the TPM values
# Add one to each value and log them (base 2)
library(tidyverse)
thyroid.tpm <- log(thyroid %>% pull(TPM) + 1)
testes.tpm <- log(testes %>% pull(TPM) + 1)
# Pearson's correlation coefficient between thyroid and testes using TPM
cor(thyroid.tpm, testes.tpm, method="pearson")
# Calculate fold change between the testes and thyroid tissue TPM values and provide top 10 genes that are overexpressed in testes
library(gtools)
tpm.foldchange <- foldchange(testes.tpm, thyroid.tpm)
#tpm.df <- merge(testes.tpm, tpm.foldchange)
tpm.sorted <- sort(tpm.foldchange, decreasing=T)
tpm.sortedgenes <- testes[order(factor(testes$TPM, levels=tpm.sorted)),]
tpm.top10genes <- head(tpm.sortedgenes, 10)
testes[order(factor(testes$TPM, levels=tpm.sorted)),]
I initially wanted to sort after merging like this:
tpm.df <- merge(testes.tpm, tpm.foldchange)
tpm.sorted <- sort(tpm.df$tpm.foldchange, decreasing=T)
but it raised an error:
Error: cannot allocate vector of size 8.0 Gb
thyroid dataframe:
# Show only the first 20 rows, first column, and 6th column of thyroid dataframe
dput(thyroid[1:20, c(1,6)])
structure(list(gene_id = c("gene0_DDX11L1", "gene1_WASH7P", "gene100_C1orf233",
"gene1000_ZC3H12A", "gene10000_CD86", "gene10001_CASR", "gene10003_CSTA",
"gene10004_CCDC58", "gene10005_FAM162A", "gene10006_WDR5B", "gene10007_LOC102723582",
"gene10008_KPNA1", "gene1001_MIR6732", "gene10010_PARP9", "gene10011_DTX3L",
"gene10012_PARP15", "gene10015_PARP14", "gene10016_HSPBAP1",
"gene10017_DIRC2", "gene10018_LOC100129550"), TPM = c(0, 45.96,
2.72, 2.4, 1.67, 5.14, 4.33, 47.68, 81.1, 10.12, 0.96, 45.21,
0, 19.63, 15.06, 0.49, 21.76, 12.16, 19.37, 5.3)), row.names = c(NA,
20L), class = "data.frame")
testes dataframe:
# Show only the first 20 rows, first column, and 6th column of testes dataframe
dput(testes[1:20, c(1,6)])
structure(list(gene_id = c("gene0_DDX11L1", "gene1_WASH7P", "gene100_C1orf233",
"gene1000_ZC3H12A", "gene10000_CD86", "gene10001_CASR", "gene10003_CSTA",
"gene10004_CCDC58", "gene10005_FAM162A", "gene10006_WDR5B", "gene10007_LOC102723582",
"gene10008_KPNA1", "gene1001_MIR6732", "gene10010_PARP9", "gene10011_DTX3L",
"gene10012_PARP15", "gene10015_PARP14", "gene10016_HSPBAP1",
"gene10017_DIRC2", "gene10018_LOC100129550"), TPM = c(2.33, 47.56,
9.45, 2.03, 3.09, 0.11, 3.73, 28.52, 120.65, 6.89, 1.38, 30.89,
0, 20.39, 13.66, 0.59, 9.62, 22.04, 7.42, 2.53)), row.names = c(NA,
20L), class = "data.frame")
Based on Akrun's comment, I've attempted:
library(gtools)
tpm.foldchange <- foldchange(thyroid.tpm, testes.tpm)
testes.sorted <- testes %>%
left_join(thyroid, by="gene_id") %>%
mutate(TPM=testes.tpm, tpm.foldchange, .keep="unused") %>%
slice_max(n=10, order_by=tpm.foldchange)
Output:
> dim(testes.sorted)
[1] 304 15
> dput(testes.sorted[1:10,])
structure(list(gene_id = c("gene10075_LOC101927056", "gene10311_A4GNT",
"gene10394_SLC9A9-AS1", "gene10504_SUCNR1", "gene10511_TMEM14E",
"gene10798_LOC102724550", "gene10990_FLJ42393", "gene11054_DPPA2P3",
"gene11065_GP5", "gene11400_USP17L12"), transcript_id.s..x = c("rna28860_NR_125396.1,rna28861_NR_125395.1",
"rna29540_NM_016161.2", "rna29785_NR_048544.1", "rna30020_NM_033050.4",
"rna30060_NM_001123228.1", "rna30716_NR_110826.1", "rna31241_NR_024413.1",
"rna31390_NR_027764.1", "rna31430_NM_004488.2", "rna32519_NM_001256853.1"
), length.x = c(659, 1771, 518, 1650, 1293, 2957, 2266, 1146,
3493, 1593), effective_length.x = c(413.57, 1525.5, 272.62, 1404.5,
1047.5, 2711.5, 2020.5, 900.5, 3247.5, 1347.5), expected_count.x = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0.12), TPM.x = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), FPKM.x = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), transcript_id.s..y = c("rna28860_NR_125396.1,rna28861_NR_125395.1",
"rna29540_NM_016161.2", "rna29785_NR_048544.1", "rna30020_NM_033050.4",
"rna30060_NM_001123228.1", "rna30716_NR_110826.1", "rna31241_NR_024413.1",
"rna31390_NR_027764.1", "rna31430_NM_004488.2", "rna32519_NM_001256853.1"
), length.y = c(796, 1771, 518, 1650, 1293, 2957, 2266, 1146,
3493, 1593), effective_length.y = c(535.05, 1510.04, 257.15,
1389.04, 1032.04, 2696.04, 2005.04, 885.04, 3232.04, 1332.04),
expected_count.y = c(9, 3, 2, 233, 2, 2, 36, 2, 35, 1.91),
TPM.y = c(0.58, 0.07, 0.27, 5.8, 0.07, 0.03, 0.62, 0.08,
0.37, 0.05), FPKM.y = c(0.29, 0.03, 0.14, 2.94, 0.03, 0.01,
0.31, 0.04, 0.19, 0.03), TPM = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0), tpm.foldchange = c(Inf, Inf, Inf, Inf, Inf, Inf, Inf,
Inf, Inf, Inf)), row.names = c(NA, 10L), class = "data.frame")
This code returns a dataframe with (304, 15) dimensions. But I'm only looking for the top ten genes. Also, please note that thyroid.tpm is the log2-transformed TPM values.
If we want to order by the foldchange, do a join first, and arrange based on the foldchange between the 'TPM' columns
library(dplyr)
library(gtools)
testes2 <- testes %>%
left_join(thyroid, by = 'gene_id') %>%
mutate(across(starts_with("TPM"), ~ log(.x + 1),
.names = "tpm_{.col}")) %>%
mutate(foldchange = foldchange(tpm_TPM.x, tpm_TPM.y)) %>%
filter(is.finite(foldchange)) %>%
arrange(tpm_TPM.x) %>%
dplyr::select(gene_id, TPM = TPM.x, foldchange) %>%
slice_head(n = 10)
If we want to select top 10 foldchange rows, use slice_max
testes %>%
left_join(thyroid, by = 'gene_id') %>%
mutate(TPM = TPM.x, foldchange = foldchange(log(TPM.x + 1), log(TPM.y + 1)),
.keep = "unused") %>%
filter(is.finite(foldchange)) %>%
slice_max(n = 10, order_by = foldchange, with_ties = FALSE)
-output
gene_id TPM foldchange
1 gene100_C1orf233 9.45 1.786222
2 gene10000_CD86 3.09 1.434249
3 gene10007_LOC102723582 1.38 1.288517
4 gene10016_HSPBAP1 22.04 1.217311
5 gene10012_PARP15 0.59 1.162893
6 gene10005_FAM162A 120.65 1.089205
7 gene10010_PARP9 20.39 1.011953
8 gene1_WASH7P 47.56 1.008704
9 gene10011_DTX3L 13.66 -1.033968
10 gene10003_CSTA 3.73 -1.076854
The merge results in memory error because it was done on two vectors creating a cartesian join

Error while creating workflow from a recipie using linear models in R

I am training a linear regression model predicting salary from company size (company_size_number) and country (country) using the StackOverflow data.
What I perform is:
Read the data. Split the data into a training set (75%) and a test set (25%).
Create a recipe that converts company_size_number into a factor variable and then transforms the two predictors into dummy variables.
Create the model specification.
Create a workflow object and add the recipe and model specification to it, then fit the model on the training set.
Calculate R² on the test set.
This is my code
library(tidyverse)
library(tidymodels)
so <- read_rds("stackoverflow.rds")
set.seed(123)
init_split <- initial_split(so)
so_training <- training(init_split)
so_testing <- testing(init_split)
rec <- recipe(salary ~ ., data = so_training %>% select(salary, company_size_number, country)) %>%
step_num2factor(company_size_number = factor(company_size_number)) %>%
step_dummy(country, company_size_number)
model_spec <- linear_reg() %>%
set_engine("lm") %>%
set_mode("regression")
fit <- workflow() %>%
add_model(model_spec) %>%
add_recipe(rec) %>%
fit(data = so_training)
predict(fit, new_data = so_testing) %>%
mutate(truth = so_testing$salary) %>%
rmse(estimate = .pred, truth = truth)
But not able to proceed due to an error:
Error: Please provide a character vector of appropriate length for `levels`.
I presume I am messing up something here in the spec_*()
rec <- recipe(salary ~ ., data = so_training %>% select(salary, company_size_number, country)) %>%
step_novel(company_size_number = factor(company_size_number)) %>%
step_dummy(country, company_size_number)
But not sure if this correct. Any inputs would be helpful.
> dput(head(so))
structure(list(country = structure(c(5L, 5L, 4L, 4L, 5L, 5L), .Label = c("Canada",
"Germany", "India", "United Kingdom", "United States"), class = "factor"),
salary = c(63750, 93000, 40625, 45000, 1e+05, 170000), years_coded_job = c(4L,
9L, 8L, 3L, 8L, 12L), open_source = c(0, 1, 1, 1, 0, 1),
hobby = c(1, 1, 1, 0, 1, 1), company_size_number = c(20,
1000, 10000, 1, 10, 100), remote = structure(c(1L, 1L, 1L,
1L, 1L, 1L), .Label = c("Remote", "Not remote"), class = "factor"),
career_satisfaction = c(8L, 8L, 5L, 10L, 8L, 10L), data_scientist = c(0,
0, 1, 0, 0, 0), database_administrator = c(1, 0, 1, 0, 0,
0), desktop_applications_developer = c(1, 0, 1, 0, 0, 0),
developer_with_stats_math_background = c(0, 0, 0, 0, 0, 0
), dev_ops = c(0, 0, 0, 0, 0, 1), embedded_developer = c(0,
0, 0, 0, 0, 0), graphic_designer = c(0, 0, 0, 0, 0, 0), graphics_programming = c(0,
0, 0, 0, 0, 0), machine_learning_specialist = c(0, 0, 0,
0, 0, 0), mobile_developer = c(0, 1, 0, 0, 1, 0), quality_assurance_engineer = c(0,
0, 0, 0, 0, 0), systems_administrator = c(1, 0, 1, 0, 0,
1), web_developer = c(0, 0, 0, 1, 1, 1)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
I have a couple of recommendations for adjustments in what you are doing.
The first is to do the selecting of variables before splitting, so that when you use a formula like salary ~ ., you and/or the functions don't get confused about what is there.
The second is to not use step_num2factor() in the way you have; it would take a lot to get it to work correctly and I think you're better served converting it to a factor before you split. Take a look at this step's documentation to see a more appropriate use for this recipe step, and notice that you have to give it levels. This is the reason you saw the error you did, but honestly I wouldn't try to find the right levels and input them there; I'd do it before splitting.
library(tidyverse)
library(tidymodels)
data("stackoverflow", package = "modeldata")
so <- janitor::clean_names(stackoverflow)
set.seed(123)
init_split <- so %>%
select(salary, company_size_number, country) %>%
mutate(company_size_number = factor(company_size_number)) %>%
initial_split()
so_training <- training(init_split)
so_testing <- testing(init_split)
rec <- recipe(salary ~ ., data = so_training) %>%
step_dummy(country, company_size_number)
model_spec <- linear_reg() %>%
set_engine("lm") %>%
set_mode("regression")
fit <- workflow() %>%
add_model(model_spec) %>%
add_recipe(rec) %>%
fit(data = so_training)
predict(fit, new_data = so_testing) %>%
mutate(truth = so_testing$salary) %>%
rmse(estimate = .pred, truth = truth)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 rmse standard 27822.
Created on 2021-05-25 by the reprex package (v2.0.0)

Getting the last number in a series before lowering it below threshold in R

I have the following data:
dat<- structure(list(Pentad = 1:73, RR = c(0, 0.014285714, 0, 0.088571429,
0.071428571, 0, 0.065714286, 0.028571429, 0.094285714, 0.011428571,
0, 0, 0, 0, 0, 0, 0.04, 0, 0.814285714, 0.285714286, 1.14, 5.334285714,
2.351428571, 1.985714286, 1.494285714, 2.005714286, 20.04857143,
25.00857143, 16.32, 11.06857143, 8.965714286, 3.985714286, 5.202857143,
7.802857143, 4.451428571, 9.22, 32.04857143, 19.50571429, 3.148571429,
2.434285714, 9.057142857, 28.70857143, 34.15142857, 33.02571429,
46.50571429, 70.61714286, 3.168571429, 1.928571429, 7.031428571,
0.902857143, 5.377142857, 11.35714286, 15.04571429, 11.66285714,
21.24, 11.43714286, 11.69428571, 2.977142857, 4.337142857, 0.871428571,
1.391428571, 0.871428571, 1.145714286, 2.317142857, 0.182857143,
0.282857143, 0.348571429, 0, 0.345714286, 0.142857143, 0.18,
4.894285714, 0.037142857), YY = c(0.577142857, 0, 1.282857143,
1.445714286, 0.111428571, 0.36, 0, 0, 0, 1, 0.011428571, 0.008571429,
0.305714286, 0, 0, 0, 0, 0.8, 0.062857143, 0, 0, 0, 0, 0.013333333,
0.043333333, 1.486666667, 0, 2.486666667, 1.943333333, 0.773333333,
8.106666667, 7.733333333, 0.5, 4.356666667, 2.66, 6.626666667,
4.404285714, 7.977142857, 12.94285714, 18.49428571, 7.357142857,
11.08285714, 9.034285714, 14.29142857, 34.61428571, 45.30285714,
6.66, 6.702857143, 5.962857143, 14.85428571, 2.1, 2.837142857,
7.391428571, 32.03714286, 9.005714286, 3.525714286, 12.32, 2.32,
7.994285714, 6.565714286, 4.771428571, 2.354285714, 0.005714286,
2.508571429, 0.817142857, 2.885714286, 0.897142857, 0, 0, 0,
0, 0.145714286, 0.434285714)), class = "data.frame", row.names = c(NA,
-73L))
There are three columns: Pentad, RR, and YY.
I would like to get the following:
(a) Get the first pentad when the precipitation exceeds the "annual mean" in "at least three consecutive pentads"
(b) Get the last pentad when the precipitation exceeds the "annual mean" in at least three consecutive pentads BEFORE lowering it below the annual mean.
I was able to do (a) using the following script:
first_exceed_seq <- function(x, thresh = mean(x), len = 3)
{
# Logical vector, does x exceed the threshold
exceed_thresh <- x > thresh
# Indices of transition points; where exceed_thresh[i - 1] != exceed_thresh[i]
transition <- which(diff(c(0, exceed_thresh)) != 0)
# Reference index, grouping observations after each transition
index <- vector("numeric", length(x))
index[transition] <- 1
index <- cumsum(index)
# Break x into groups following the transitions
exceed_list <- split(exceed_thresh, index)
# Get the number of values exceeded in each index period
num_exceed <- vapply(exceed_list, sum, numeric(1))
# Get the starting index of the first sequence where more then len exceed thresh
transition[as.numeric(names(which(num_exceed >= len))[1])]
}
first_exceed_seq(dat$RR)
Here's the plot of the time series:
The correct answer in (a) is 27.
I would like to ask how can I do this for (b). The correct answer for (b) should be 57.
I'll appreciate any help on in this in R.
I don't know if I got your problem right.
This is what I tried:
dat %>%
mutate(
anual_mean = mean(RR),
exceed_thresh = RR > anual_mean,
lag1 = lag(exceed_thresh, 1),
lag2 = lag(exceed_thresh, 2),
pick_3 = ifelse(exceed_thresh & lag1 & lag2, RR, NA)
)

ggplot2 is not plotting two groups

I am having a complete brain fart right now. Why is the following code not plotting two lines for each category of the variable Ford? To my mind, I have the variable mapped as a grouping variable and then an aesthetic (col) in both geom_point() and geom_line(). I feel like I'm just overlooking something very basic.
#libraries
library(tidyverse)
#data
structure(list(stressx = c(0, 0.33, 0.67, 1, 0, 0.33, 0.67, 1,
0, 0.33, 0.67, 1, 0, 0.33, 0.67, 1), visiblex = c(0, 0, 0, 0,
1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1), ford = c(0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1), preds = c(0.166275511711196,
0.25404479263251, 0.344473748733258, 0.432243029654572, 0.417891216538386,
0.449861131692899, 0.482799832155125, 0.514769747309638, 0.166275511711196,
0.25404479263251, 0.344473748733258, 0.432243029654572, 0.417891216538386,
0.449861131692899, 0.482799832155125, 0.514769747309638), se.fit =
c(0.0216850668407667,
0.0140669010411715, 0.014932848560481, 0.0233259879905658,
0.0546688696666978,
0.034867400606124, 0.0282122239553816, 0.0418803253364085,
0.0216850668407667,
0.0140669010411715, 0.014932848560481, 0.0233259879905658,
0.0546688696666978,
0.034867400606124, 0.0282122239553816, 0.0418803253364085)), .Names =
c("stressx",
"visiblex", "ford", "preds", "se.fit"), out.attrs = structure(list(
dim = structure(c(4L, 2L, 2L), .Names = c("stressx", "visiblex",
"ford")), dimnames = structure(list(stressx = c("stressx=0.0000000",
"stressx=0.3333333", "stressx=0.6666667", "stressx=1.0000000"
), visiblex = c("visiblex=0", "visiblex=1"), ford = c("ford=0",
"ford=1")), .Names = c("stressx", "visiblex", "ford"))), .Names = c("dim",
"dimnames")), row.names = c(NA, -16L), class = "data.frame")`
My plot
newdat %>%
mutate(visiblex=recode_factor(visiblex, `0`="Not Visible Minority",
`1`="Visible Minority"), ford=recode_factor(ford, `0`="Disapprove",
`1`="Approve"), stressx=recode_factor(stressx, `0`='Strongly disagree',
`0.33`='Somewhat disagree', `0.67`='Somewhat agree', `1`='Strongly agree'))
%>%
rename(Stress=stressx, Visible=visiblex, Ford=ford, Prob=preds) %>%
#filter(Ford=='Approve') %>%
ggplot(., aes(x=Stress, y=Prob, group=Ford))+
geom_point(aes(col=Ford))+
geom_line(aes(col=Ford))+
facet_wrap(~Visible)+
ylim(c(0,1))+
theme(axis.text.x=element_text(angle=45, vjust=0.5))`
It's because you have identical data points for both levels of the factor variable Ford. I have modified your code slightly to show the data and then plotted the data with geom_jitter instead of geom_point and now you can see both data points. Since the underlying datapoints are identical, the lines drawn through those data points are also overlapping and only one of them is visible.
#libraries
library(tidyverse)
#data
newdat <- structure(
list(
stressx = c(0, 0.33, 0.67, 1, 0, 0.33, 0.67, 1,
0, 0.33, 0.67, 1, 0, 0.33, 0.67, 1),
visiblex = c(0, 0, 0, 0,
1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1),
ford = c(0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1),
preds = c(
0.166275511711196,
0.25404479263251,
0.344473748733258,
0.432243029654572,
0.417891216538386,
0.449861131692899,
0.482799832155125,
0.514769747309638,
0.166275511711196,
0.25404479263251,
0.344473748733258,
0.432243029654572,
0.417891216538386,
0.449861131692899,
0.482799832155125,
0.514769747309638
),
se.fit =
c(
0.0216850668407667,
0.0140669010411715,
0.014932848560481,
0.0233259879905658,
0.0546688696666978,
0.034867400606124,
0.0282122239553816,
0.0418803253364085,
0.0216850668407667,
0.0140669010411715,
0.014932848560481,
0.0233259879905658,
0.0546688696666978,
0.034867400606124,
0.0282122239553816,
0.0418803253364085
)
),
.Names =
c("stressx",
"visiblex", "ford", "preds", "se.fit"),
out.attrs = structure(
list(
dim = structure(c(4L, 2L, 2L), .Names = c("stressx", "visiblex",
"ford")),
dimnames = structure(
list(
stressx = c(
"stressx=0.0000000",
"stressx=0.3333333",
"stressx=0.6666667",
"stressx=1.0000000"
),
visiblex = c("visiblex=0", "visiblex=1"),
ford = c("ford=0",
"ford=1")
),
.Names = c("stressx", "visiblex", "ford")
)
),
.Names = c("dim",
"dimnames")
),
row.names = c(NA, -16L),
class = "data.frame"
)
#my plot
data <- newdat %>%
mutate(
visiblex = recode_factor(visiblex, `0` = "Not Visible Minority",
`1` = "Visible Minority"),
ford = recode_factor(ford, `0` = "Disapprove",
`1` =
"Approve"),
stressx = recode_factor(
stressx,
`0` = 'Strongly disagree',
`0.33` =
'Somewhat disagree',
`0.67` = 'Somewhat agree',
`1` = 'Strongly agree'
)
) %>%
dplyr::rename(
Stress = stressx,
Visible = visiblex,
Ford = ford,
Prob = preds
)
# display data
data
#> Stress Visible Ford Prob se.fit
#> 1 Strongly disagree Not Visible Minority Disapprove 0.1662755 0.02168507
#> 2 Somewhat disagree Not Visible Minority Disapprove 0.2540448 0.01406690
#> 3 Somewhat agree Not Visible Minority Disapprove 0.3444737 0.01493285
#> 4 Strongly agree Not Visible Minority Disapprove 0.4322430 0.02332599
#> 5 Strongly disagree Visible Minority Disapprove 0.4178912 0.05466887
#> 6 Somewhat disagree Visible Minority Disapprove 0.4498611 0.03486740
#> 7 Somewhat agree Visible Minority Disapprove 0.4827998 0.02821222
#> 8 Strongly agree Visible Minority Disapprove 0.5147697 0.04188033
#> 9 Strongly disagree Not Visible Minority Approve 0.1662755 0.02168507
#> 10 Somewhat disagree Not Visible Minority Approve 0.2540448 0.01406690
#> 11 Somewhat agree Not Visible Minority Approve 0.3444737 0.01493285
#> 12 Strongly agree Not Visible Minority Approve 0.4322430 0.02332599
#> 13 Strongly disagree Visible Minority Approve 0.4178912 0.05466887
#> 14 Somewhat disagree Visible Minority Approve 0.4498611 0.03486740
#> 15 Somewhat agree Visible Minority Approve 0.4827998 0.02821222
#> 16 Strongly agree Visible Minority Approve 0.5147697 0.04188033
# plot the data
data %>%
#filter(Ford=='Approve') %>%
ggplot2::ggplot(data = .,
mapping = aes(x = Stress, y = Prob, group = Ford, colour = Ford)) +
ggplot2::geom_jitter() + # change this back geom_point()
ggplot2::geom_line() +
ggplot2::facet_wrap( ~ Visible) +
ggplot2::scale_y_continuous(limits = c(0, 1)) +
ggplot2::theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
Created on 2018-03-13 by the reprex package (v0.2.0).

How to properly index list items to return rows, not columns, inside a for loop

I'm trying to write a for loop within another for loop. The first loop grabs the ith vcov matrix from a list of variously sized matrices (vcmats below) and grabs a frame of 24 predictor models of appropriate dimension to multiply with the current vcov matrix from a list of frames (jacobians below) for the different models. The second loop should pull the jth record (row) from the selected predictor frame, correctly format it, then run the calculation with the vcov matrix and output an indicator variable and calculated result needed for post processing to the holding table (holdtab).
When I run the code below I get the following error: Error in jjacob[, 1:4] : incorrect number of dimensions because R is returning the column of 1s (i.e. the intercept column of jacobs), not the complete first record (i.e. jjacob = jacobs[1,]). I've substantially simplified the example but left enough complexity to demonstrate the problem. I would appreciate any help in resolving this issue.
vcmats <- list(structure(c(0.67553, -0.1932, -0.00878, -0.00295, -0.00262,
-0.00637, -0.1932, 0.19988, 0.00331, -0.00159, 0.00149, 2e-05,
-0.00878, 0.00331, 0.00047, -6e-05, 3e-05, 3e-05, -0.00295, -0.00159,
-6e-05, 0.00013, -2e-05, 6e-05, -0.00262, 0.00149, 3e-05, -2e-05,
2e-05, 0, -0.00637, 2e-05, 3e-05, 6e-05, 0, 0.00026), .Dim = c(6L,
6L)), structure(c(0.38399, -0.03572, -0.00543, -0.00453, -0.00634,
-0.03572, 0.10912, 0.00118, -0.00044, 0.00016, -0.00543, 0.00118,
0.00042, -3e-05, 4e-05, -0.00453, -0.00044, -3e-05, 0.00011,
5e-05, -0.00634, 0.00016, 4e-05, 5e-05, 0.00025), .Dim = c(5L,
5L)))
jacobians <- list(structure(list(intcpt = c(1, 1, 1, 1), species = c(1, 1,
0, 0), nage = c(6, 6, 6, 6), T = c(12, 50, 12, 50), hgt = c(90,
90, 90, 90), moon = c(7, 7, 7, 7), hXm = c(0, 0, 0, 0), covr = c(0,
0, 0, 0), het = c(0, 0, 0, 0)), .Names = c("intcpt", "species",
"nage", "T", "hgt", "moon", "hXm", "covr", "het"), row.names = c("1",
"1.4", "1.12", "1.16"), class = "data.frame"), structure(list(
intcpt = c(1, 1, 1, 1), species = c(1, 1, 0, 0), nage = c(6,
6, 6, 6), T = c(12, 50, 12, 50), hgt = c(0, 0, 0, 0), moon = c(7,
7, 7, 7), hXm = c(0, 0, 0, 0), covr = c(0, 0, 0, 0), het = c(0,
0, 0, 0)), .Names = c("intcpt", "species", "nage", "T", "hgt",
"moon", "hXm", "covr", "het"), row.names = c("2", "2.4", "2.12",
"2.16"), class = "data.frame"))
holdtab <- structure(list(model = structure(c(4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L), .Label = c("M.1.BaseCov", "M.2.Height", "M.5.Height.X.LastNewMoon",
"M.6.Height.plus.LastNew", "M.7.LastNewMoon", "M.G.Global"), class = "factor"),
aicc = c(341.317, 341.317, 341.317, 341.317, 342.1412, 342.1412,
342.1412, 342.1412), species = c(NA, NA, NA, NA, NA, NA,
NA, NA), condVar = c(NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("model",
"aicc", "species", "condVar"), row.names = c(1L, 2L, 3L, 4L,
25L, 26L, 27L, 28L), class = "data.frame")
jloop <- 1
for (imat in vcmats) { # Call the outside loop of vcov matrices
jacobs = jacobians[[jloop]] # Set tempvar jacobs as the jth member of the jacobians frame (n/24)
for (jjacob in jacobs) { # Call inside loop of lines in jacob (each individual set of predictor levels)
# I need to reduce the vector length to match my vcov matrix so
pt1 = jjacob[,1:4] # Separate Core columns from variable columns (because I don't want to drop species when ==0)
pt2 = jjacob[,5:9] # Pull out variable columns for next step
pt2 = pt2[,!apply(pt2 == 0, 2, all)] # Drop any variable columns that ==0
jjacob = cbind(pt1, pt2) # Reconstruct the record now of correct dimensions for the relevant vcov matrix
jjacob = as.matrix(jjacob) # Explicitly convert jjmod - I was having trouble with this previously
tj = (t(jjacob)) # Transpose the vector
condvar = jjacob %*% imat %*% tj # run the calculation
condVarTab[record,3] = jjacob[2] # Write species 0 or 1 to the output table
condVarTab[record,4] = condvar # Write the conditional variance to the table
record = record+1 # Iterate the record number for the next output run
}
jloop = jloop+1 # Once all 24 models in a frame are calculated iterate to the next frame of models which will be associated with a new vcv matrix
}

Resources