Entering data into multiple dataframes - r

I have two loops, the first looks at a specific protein then the second looks at specific cells. In addition I have 16 tables (all named the protein then "_table"). If I specify the cell type I can get the data to enter into the correct row but if I try paste0(temp_TSPAN, "_table") I get the error incorrect number of subscripts on matrix
Any ideas how I can get my loop to specify the correct table?
Here is the second loop which codes for putting data into the tables:
temp_cell <- xCell_cells[i]
print(temp_TSPAN)
print(temp_cell)
temp_means <- c(mean(xCell_Lum_A_Q1[,temp_cell]),mean(xCell_Lum_A_Q2[,temp_cell]),mean(xCell_Lum_A_Q3[,temp_cell]),mean(xCell_Lum_A_Q4[,temp_cell]),
mean(xCell_Lum_B_Q1[,temp_cell]),mean(xCell_Lum_B_Q2[,temp_cell]),mean(xCell_Lum_B_Q3[,temp_cell]),mean(xCell_Lum_B_Q4[,temp_cell]),
mean(xCell_Her_2_Q1[,temp_cell]),mean(xCell_Her_2_Q2[,temp_cell]),mean(xCell_Her_2_Q3[,temp_cell]),mean(xCell_Her_2_Q4[,temp_cell]),
mean(xCell_Basal_Q1[,temp_cell]),mean(xCell_Basal_Q2[,temp_cell]),mean(xCell_Basal_Q3[,temp_cell]),mean(xCell_Basal_Q4[,temp_cell]),
mean(xCell_Normal_Q1[,temp_cell]),mean(xCell_Normal_Q2[,temp_cell]),mean(xCell_Normal_Q3[,temp_cell]),mean(xCell_Normal_Q4[,temp_cell]))
print(temp_means)
paste0(temp_TSPAN, "_table")[temp_cell,] <- temp_means
temp_means <- c()
}
The entire code
library(dplyr)
library(RColorBrewer)
RNA_seq <- read.table("RNASeq2Norm_expr_BCRA.txt", stringsAsFactors = F)
xCell <- read.table("..../xCell_ES_RNAseq.txt", stringsAsFactors = F)
PAM50 <- read.table("..../PAM50_subtypes.txt", stringsAsFactors = F)
TSPANS <- read.table("..../TSPANS.txt", stringsAsFactors = F)
len_TSPAN <- length(TSPANS$V1)
col <- brewer.pal(4, "Pastel1")
xCell_cells <- rownames(xCell)
#Create table for quartile means to be entered
for (i in seq(1,len_TSPAN)){
temp_TSPAN <- TSPANS$V1[i]
print(temp_TSPAN)
assign(as.character((temp_TSPAN)), value = data.frame(Lum_A_Q1_means = rep(NA, 67), Lum_A_Q2_means = rep(NA,67),
Lum_A_Q3_means = rep(NA, 67), Lum_A_Q4_means = rep(NA,67),
Lum_B_Q1_means = rep(NA, 67), Lum_B_Q2_means = rep(NA,67),
Lum_B_Q3_means = rep(NA, 67), Lum_B_Q4_means = rep(NA,67),
Her_2_Q1_means = rep(NA, 67), Her_2_Q2_means = rep(NA,67),
Her_2_Q3_means = rep(NA, 67), Her_2_Q4_means = rep(NA,67),
Basal_Q1_means = rep(NA, 67), Basal_Q2_means = rep(NA,67),
Basal_Q3_means = rep(NA, 67), Basal_Q4_means = rep(NA,67),
Normal_Q1_means = rep(NA, 67), Normal_Q2_means = rep(NA,67),
Normal_Q3_means = rep(NA, 67), Normal_Q4_means = rep(NA,67),
row.names = xCell_cells))
}
temp_TSPAN <- c()
temp_cell <- c()
#Determine which samples belong to each quartile
for (T in seq(1,len_TSPAN)) {
temp_TSPAN <- TSPANS$V1[T]
print(temp_TSPAN)
Lum_A <- RNA_seq[temp_TSPAN, PAM50$subtype == "LumA"]
Lum_A_Quartiles <- quantile(Lum_A[temp_TSPAN,])
Q1_Lum_A <- Lum_A[,(Lum_A[temp_TSPAN,]) <= Lum_A_Quartiles$`25%`]
Q2_Lum_A <- Lum_A[,(Lum_A[temp_TSPAN,]) > Lum_A_Quartiles$`25%`]
Q2_Lum_A <- Q2_Lum_A[,(Q2_Lum_A[temp_TSPAN,]) <= Lum_A_Quartiles$`50%`]
Q3_Lum_A <- Lum_A[,(Lum_A[temp_TSPAN,]) > Lum_A_Quartiles$`50%`]
Q3_Lum_A <- Q3_Lum_A[,(Q3_Lum_A[temp_TSPAN,]) <= Lum_A_Quartiles$`75%`]
Q4_Lum_A <- Lum_A[,(Lum_A[temp_TSPAN,]) > Lum_A_Quartiles$`75%`]
Lum_B <- RNA_seq[temp_TSPAN, PAM50$subtype == "LumB"]
Lum_B_Quartiles <- quantile(Lum_B[temp_TSPAN,])
Q1_Lum_B <- Lum_B[,(Lum_B[temp_TSPAN,]) <= Lum_B_Quartiles$`25%`]
Q2_Lum_B <- Lum_B[,(Lum_B[temp_TSPAN,]) > Lum_B_Quartiles$`25%`]
Q2_Lum_B <- Q2_Lum_B[,(Q2_Lum_B[temp_TSPAN,]) <= Lum_B_Quartiles$`50%`]
Q3_Lum_B <- Lum_B[,(Lum_B[temp_TSPAN,]) > Lum_B_Quartiles$`50%`]
Q3_Lum_B <- Q3_Lum_B[,(Q3_Lum_B[temp_TSPAN,]) <= Lum_B_Quartiles$`75%`]
Q4_Lum_B <- Lum_B[,(Lum_B[temp_TSPAN,]) > Lum_B_Quartiles$`75%`]
Her_2 <- RNA_seq[temp_TSPAN, PAM50$subtype == "Her2"]
Her_2_Quartiles <- quantile(Her_2[temp_TSPAN,])
Q1_Her_2 <- Her_2[,(Her_2[temp_TSPAN,]) <= Her_2_Quartiles$`25%`]
Q2_Her_2 <- Her_2[,(Her_2[temp_TSPAN,]) > Her_2_Quartiles$`25%`]
Q2_Her_2 <- Q2_Her_2[,(Q2_Her_2[temp_TSPAN,]) <= Her_2_Quartiles$`50%`]
Q3_Her_2 <- Her_2[,(Her_2[temp_TSPAN,]) > Her_2_Quartiles$`50%`]
Q3_Her_2 <- Q3_Her_2[,(Q3_Her_2[temp_TSPAN,]) <= Her_2_Quartiles$`75%`]
Q4_Her_2 <- Her_2[,(Her_2[temp_TSPAN,]) > Her_2_Quartiles$`75%`]
Basal <- RNA_seq[temp_TSPAN, PAM50$subtype == "Basal"]
Basal_Quartiles <- quantile(Basal[temp_TSPAN,])
Q1_Basal <- Basal[,(Basal[temp_TSPAN,]) <= Basal_Quartiles$`25%`]
Q2_Basal <- Basal[,(Basal[temp_TSPAN,]) > Basal_Quartiles$`25%`]
Q2_Basal <- Q2_Basal[,(Q2_Basal[temp_TSPAN,]) <= Basal_Quartiles$`50%`]
Q3_Basal <- Basal[,(Basal[temp_TSPAN,]) > Basal_Quartiles$`50%`]
Q3_Basal <- Q3_Basal[,(Q3_Basal[temp_TSPAN,]) <= Basal_Quartiles$`75%`]
Q4_Basal <- Basal[,(Basal[temp_TSPAN,]) > Basal_Quartiles$`75%`]
Normal <- RNA_seq[temp_TSPAN, PAM50$subtype == "Normal"]
Normal_Quartiles <- quantile(Normal[temp_TSPAN,])
Q1_Normal <- Normal[,(Normal[temp_TSPAN,]) <= Normal_Quartiles$`25%`]
Q2_Normal <- Normal[,(Normal[temp_TSPAN,]) > Normal_Quartiles$`25%`]
Q2_Normal <- Q2_Normal[,(Q2_Normal[temp_TSPAN,]) <= Normal_Quartiles$`50%`]
Q3_Normal <- Normal[,(Normal[temp_TSPAN,]) > Normal_Quartiles$`50%`]
Q3_Normal <- Q3_Normal[,(Q3_Normal[temp_TSPAN,]) <= Normal_Quartiles$`75%`]
Q4_Normal <- Normal[,(Normal[temp_TSPAN,]) > Normal_Quartiles$`75%`]
Lum_A_Q1_samples <- colnames(Q1_Lum_A)
Lum_A_Q2_samples <- colnames(Q2_Lum_A)
Lum_A_Q3_samples <- colnames(Q3_Lum_A)
Lum_A_Q4_samples <- colnames(Q4_Lum_A)
Lum_B_Q1_samples <- colnames(Q1_Lum_B)
Lum_B_Q2_samples <- colnames(Q2_Lum_B)
Lum_B_Q3_samples <- colnames(Q3_Lum_B)
Lum_B_Q4_samples <- colnames(Q4_Lum_B)
Her_2_Q1_samples <- colnames(Q1_Her_2)
Her_2_Q2_samples <- colnames(Q2_Her_2)
Her_2_Q3_samples <- colnames(Q3_Her_2)
Her_2_Q4_samples <- colnames(Q4_Her_2)
Basal_Q1_samples <- colnames(Q1_Basal)
Basal_Q2_samples <- colnames(Q2_Basal)
Basal_Q3_samples <- colnames(Q3_Basal)
Basal_Q4_samples <- colnames(Q4_Basal)
Normal_Q1_samples <- colnames(Q1_Normal)
Normal_Q2_samples <- colnames(Q2_Normal)
Normal_Q3_samples <- colnames(Q3_Normal)
Normal_Q4_samples <- colnames(Q4_Normal)
#Finding enrichment scores for the samples in each quartile
xCell_Lum_A_Q1 <- t(xCell[,Lum_A_Q1_samples])
xCell_Lum_A_Q2 <- t(xCell[,Lum_A_Q2_samples])
xCell_Lum_A_Q3 <- t(xCell[,Lum_A_Q3_samples])
xCell_Lum_A_Q4 <- t(xCell[,Lum_A_Q4_samples])
xCell_Lum_B_Q1 <- t(xCell[,Lum_B_Q1_samples])
xCell_Lum_B_Q2 <- t(xCell[,Lum_B_Q2_samples])
xCell_Lum_B_Q3 <- t(xCell[,Lum_B_Q3_samples])
xCell_Lum_B_Q4 <- t(xCell[,Lum_B_Q4_samples])
xCell_Her_2_Q1 <- t(xCell[,Her_2_Q1_samples])
xCell_Her_2_Q2 <- t(xCell[,Her_2_Q2_samples])
xCell_Her_2_Q3 <- t(xCell[,Her_2_Q3_samples])
xCell_Her_2_Q4 <- t(xCell[,Her_2_Q4_samples])
xCell_Basal_Q1 <- t(xCell[,Basal_Q1_samples])
xCell_Basal_Q2 <- t(xCell[,Basal_Q2_samples])
xCell_Basal_Q3 <- t(xCell[,Basal_Q3_samples])
xCell_Basal_Q4 <- t(xCell[,Basal_Q4_samples])
xCell_Normal_Q1 <- t(xCell[,Normal_Q1_samples])
xCell_Normal_Q2 <- t(xCell[,Normal_Q2_samples])
xCell_Normal_Q3 <- t(xCell[,Normal_Q3_samples])
xCell_Normal_Q4 <- t(xCell[,Normal_Q4_samples])
len_xCell <- length(xCell_cells)
temp_means <- c()
for (i in seq(1, len_xCell)){
temp_cell <- xCell_cells[i]
print(temp_TSPAN)
print(temp_cell)
temp_means <- c(mean(xCell_Lum_A_Q1[,temp_cell]),mean(xCell_Lum_A_Q2[,temp_cell]),mean(xCell_Lum_A_Q3[,temp_cell]),mean(xCell_Lum_A_Q4[,temp_cell]),
mean(xCell_Lum_B_Q1[,temp_cell]),mean(xCell_Lum_B_Q2[,temp_cell]),mean(xCell_Lum_B_Q3[,temp_cell]),mean(xCell_Lum_B_Q4[,temp_cell]),
mean(xCell_Her_2_Q1[,temp_cell]),mean(xCell_Her_2_Q2[,temp_cell]),mean(xCell_Her_2_Q3[,temp_cell]),mean(xCell_Her_2_Q4[,temp_cell]),
mean(xCell_Basal_Q1[,temp_cell]),mean(xCell_Basal_Q2[,temp_cell]),mean(xCell_Basal_Q3[,temp_cell]),mean(xCell_Basal_Q4[,temp_cell]),
mean(xCell_Normal_Q1[,temp_cell]),mean(xCell_Normal_Q2[,temp_cell]),mean(xCell_Normal_Q3[,temp_cell]),mean(xCell_Normal_Q4[,temp_cell]))
print(temp_means)
nm1 <- temp_TSPAN, "_table")
assign(nm1, `[<-`(get(nm1), get(nm1)[temp_cell,], temp_means))
temp_means <- c()
}
}
Sample data
> dput(head(TSPANS))
structure(list(V1 = c("TSPAN1", "TSPAN3", "TSPAN4", "TSPAN6",
"TSPAN8", "TSPAN9")), row.names = c(NA, 6L), class = "data.frame")
> dput(head(PAM50))
structure(list(Sample_ID = c("TCGA.3C.AAAU.01A.11R.A41B.07",
"TCGA.3C.AALI.01A.11R.A41B.07", "TCGA.3C.AALJ.01A.31R.A41B.07",
"TCGA.3C.AALK.01A.11R.A41B.07", "TCGA.4H.AAAK.01A.12R.A41B.07",
"TCGA.5L.AAT0.01A.12R.A41B.07"), subtype = c("LumB", "Her2",
"LumB", "Her2", "LumB", "LumA")), row.names = c("1", "2", "3",
"4", "5", "6"), class = "data.frame")
> dput(xCell[1:5, 1:5])
structure(list(TCGA.3C.AAAU.01A.11R.A41B.07 = c(0.0182278777214451,
0, 0, 0.00312390016077943, 0.136068543973221), TCGA.3C.AALI.01A.11R.A41B.07 = c(0.282595778602895,
0, 0.0600603500818251, 0.0589537608635649, 0.205506668589802),
TCGA.3C.AALJ.01A.31R.A41B.07 = c(0.18283171431184, 0.0941680866198556,
0.146150110122777, 0.0304405814585031, 8.9658687089931e-20
), TCGA.3C.AALK.01A.11R.A41B.07 = c(0.134145304728982, 0.032112973032126,
0.154386799682783, 0, 4.17812708486922e-20), TCGA.4H.AAAK.01A.12R.A41B.07 = c(0.106111324096064,
0.0121130054841642, 0.191944288358642, 0, 0.125099426066817
)), row.names = c("aDC", "Adipocytes", "Astrocytes", "B-cells",
"Basophils"), class = "data.frame")
> dput(RNA_seq[1:5, 1:5])
structure(list(TCGA.3C.AAAU.01A.11R.A41B.07 = c(197.0897, 0,
0, 102.9634, 1.3786), TCGA.3C.AALI.01A.11R.A41B.07 = c(237.3844,
0, 0, 70.8646, 4.3502), TCGA.3C.AALJ.01A.31R.A41B.07 = c(423.2366,
0.9066, 0, 161.2602, 0), TCGA.3C.AALK.01A.11R.A41B.07 = c(191.0178,
0, 0, 62.5072, 1.6549), TCGA.4H.AAAK.01A.12R.A41B.07 = c(268.8809,
0.4255, 3.8298, 154.3702, 3.4043)), row.names = c("A1BG", "A1CF",
"A2BP1", "A2LD1", "A2ML1"), class = "data.frame")
> dput(head(TSPAN1_table))
structure(list(Lum_A_Q1_means = c(NA, NA, NA, NA, NA, NA), Lum_A_Q2_means = c(NA,
NA, NA, NA, NA, NA), Lum_A_Q3_means = c(NA, NA, NA, NA, NA, NA
), Lum_A_Q4_means = c(NA, NA, NA, NA, NA, NA), Lum_B_Q1_means = c(NA,
NA, NA, NA, NA, NA), Lum_B_Q2_means = c(NA, NA, NA, NA, NA, NA
), Lum_B_Q3_means = c(NA, NA, NA, NA, NA, NA), Lum_B_Q4_means = c(NA,
NA, NA, NA, NA, NA), Her_2_Q1_means = c(NA, NA, NA, NA, NA, NA
), Her_2_Q2_means = c(NA, NA, NA, NA, NA, NA), Her_2_Q3_means = c(NA,
NA, NA, NA, NA, NA), Her_2_Q4_means = c(NA, NA, NA, NA, NA, NA
), Basal_Q1_means = c(NA, NA, NA, NA, NA, NA), Basal_Q2_means = c(NA,
NA, NA, NA, NA, NA), Basal_Q3_means = c(NA, NA, NA, NA, NA, NA
), Basal_Q4_means = c(NA, NA, NA, NA, NA, NA), Normal_Q1_means = c(NA,
NA, NA, NA, NA, NA), Normal_Q2_means = c(NA, NA, NA, NA, NA,
NA), Normal_Q3_means = c(NA, NA, NA, NA, NA, NA), Normal_Q4_means = c(NA,
NA, NA, NA, NA, NA)), row.names = c("aDC", "Adipocytes", "Astrocytes",
"B-cells", "Basophils", "CD4+ memory T-cells"), class = "data.frame")

We need to get the value with get an assign using assign
temp_cell <- xCell_cells[i]
print(temp_TSPAN)
print(temp_cell)
temp_means <- c(mean(xCell_Lum_A_Q1[,temp_cell]),mean(xCell_Lum_A_Q2[,temp_cell]),mean(xCell_Lum_A_Q3[,temp_cell]),mean(xCell_Lum_A_Q4[,temp_cell]),
mean(xCell_Lum_B_Q1[,temp_cell]),mean(xCell_Lum_B_Q2[,temp_cell]),mean(xCell_Lum_B_Q3[,temp_cell]),mean(xCell_Lum_B_Q4[,temp_cell]),
mean(xCell_Her_2_Q1[,temp_cell]),mean(xCell_Her_2_Q2[,temp_cell]),mean(xCell_Her_2_Q3[,temp_cell]),mean(xCell_Her_2_Q4[,temp_cell]),
mean(xCell_Basal_Q1[,temp_cell]),mean(xCell_Basal_Q2[,temp_cell]),mean(xCell_Basal_Q3[,temp_cell]),mean(xCell_Basal_Q4[,temp_cell]),
mean(xCell_Normal_Q1[,temp_cell]),mean(xCell_Normal_Q2[,temp_cell]),mean(xCell_Normal_Q3[,temp_cell]),mean(xCell_Normal_Q4[,temp_cell]))
print(temp_means)
nm1 <- temp_TSPAN, "_table")
assign(nm1, `[<-`(get(nm1), get(nm1)[temp_cell,], temp_means))
temp_means <- c()
}
May be the OP is looking for simplified version with
lapply(split(RNA_seq, setNames(PAM50$subtype, PAM50$Sample_ID)[colnames(RNA_seq)]),
function(dat) apply(dat, 1, function(x) {
qnt <- quantile(x)
data.frame(val = names(x), grp = names(qnt)[findInterval(x, qnt)])
apply(xCell[, names(x)], 2, function(y) tapply(y, names(x), FUN = mean))
}))

Related

Error message in R: not a symmetric or triangular matrix

I am trying to convert a correlation matrix to a covariance matrix using cor2cov in R.
library(MBESS)
eff_1971 <- c(NA, .56, .25, .25, .22, -.47, -.01, -.06)
eff_1972 <- c(NA, NA, .23, .23, .25, .47, -.01, .03)
annual_earnings_1970 <- c(NA, NA, NA, .88, .83, -.02, -.28, -.14)
annual_earnings_1971 <- c(NA, NA, NA, NA, .88, -.02, .21, -.29)
annual_earnings_1972 <- c(NA, NA, NA, NA, NA, .03, .06, .21)
change_eff_1971_1972 <- c(NA, NA, NA, NA, NA, NA, 0.0, .1)
change_ann_earn_1970_1971 <- c(NA, NA, NA, NA, NA, NA, NA, -.29)
change_ann_earn_1971_1972 <- c(NA, NA, NA, NA, NA, NA, NA, NA)
df <- data.frame(eff_1971,
eff_1972,
annual_earnings_1970,
annual_earnings_1971,
annual_earnings_1972,
change_eff_1971_1972,
change_ann_earn_1970_1971,
change_ann_earn_1971_1972)
df <- as.matrix(df)
sd <- c(.82, .82, .52, .51, .50, .77, .25, .25)
cor2cov(df, sd)
However, I get this error message:
Error in cor2cov(df, sd) :
The object 'cor.mat' should be either a symmetric or a triangular matrix
Does anyone know how I can fix this error?
Thank you!
You can make df triangular by setting the diag to 1 and the upper values to 0
diag(df) <- 1
df[is.na(df)] <- 0

linear regression model with dplyr on sepcified columns by name

I have the following data frame, each row containing four dates ("y") and four measurements ("x"):
df = structure(list(x1 = c(69.772808673525, NA, 53.13125414839,
17.3033274666411,
NA, 38.6120670385487, 57.7229000792707, 40.7654208618078, 38.9010405201831,
65.7108936694177), y1 = c(0.765671296296296, NA, 1.37539351851852,
0.550277777777778, NA, 0.83037037037037, 0.0254398148148148,
0.380671296296296, 1.368125, 2.5250462962963), x2 = c(81.3285388496182,
NA, NA, 44.369872853302, NA, 61.0746827226573, 66.3965114460601,
41.4256874481852, 49.5461413070349, 47.0936997726146), y2 =
c(6.58287037037037,
NA, NA, 9.09377314814815, NA, 7.00127314814815, 6.46597222222222,
6.2462962962963, 6.76976851851852, 8.12449074074074), x3 = c(NA,
60.4976916064608, NA, 45.3575294731303, 45.159758146854, 71.8459173097114,
NA, 37.9485456227131, 44.6307631013742, 52.4523342186143), y3 = c(NA,
12.0026157407407, NA, 13.5601157407407, 16.1213657407407, 15.6431018518519,
NA, 15.8986805555556, 13.1395138888889, 17.9432638888889), x4 = c(NA,
NA, NA, 57.3383407228293, NA, 59.3921356160536, 67.4231673171527,
31.853845252547, NA, NA), y4 = c(NA, NA, NA, 18.258125, NA,
19.6074768518519,
20.9696527777778, 23.7176851851852, NA, NA)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))
I would like to create an additional column containing the slope of all the y's versus all the x's, for each row (each row is a patient with these 4 measurements).
Here is what I have so far:
df <- df %>% mutate(Slope = lm(vars(starts_with("y") ~
vars(starts_with("x"), data = .)
I am getting an error:
invalid type (list) for variable 'vars(starts_with("y"))'...
What am I doing wrong, and how can I calculate the rowwise slope?
You are using a tidyverse syntax but your data is not tidy...
Maybe you should rearrange your data.frame and rethink the way you store your data.
Here is how to do it in a quick and dirty way (at least if I understood your explanations correctly):
df <- merge(reshape(df[,(1:4)*2-1], dir="long", varying = list(1:4), v.names = "x", idvar = "patient"),
reshape(df[,(1:4)*2], dir="long", varying = list(1:4), v.names = "y", idvar = "patient"))
df$patient <- factor(df$patient)
Then you could loop over the patients, perform a linear regression and get the slopes as a vector:
sapply(levels(df$patient), function(pat) {
coef(lm(y~x,df[df$patient==pat,],na.action = "na.omit"))[2]
})

R: Pearson correlation in a loop, prevent stopping when an error occurs and output NAs

I want to run Pearson correlations of each row of a matrix (dat) vs a vector (v1), as part of a loop, and output the correlation coefficients and associated p-values in a table. Here is an example for random data (data pasted at the end):
result_table <- data.frame(matrix(ncol = 2, nrow = nrow(dat)))
colnames(result_table) <- c("correlation_coefficient", "pvalue")
for(i in 1:nrow(dat)){
print(i)
corr <- cor.test(as.numeric(dat[i,]), v1, na.action = "na.omit")
result_table[i,1] <- corr$estimate
result_table[i,2] <- corr$p.value
}
When cor.test() removes missing data, sometimes there are not enough observations remaining and the loop stops with an error (for example at row 11). I would like the loop to continue running, just leaving the values in the result table as NAs. I think the result table should then look like this:
> result_table
correlation_coefficient pvalue
1 0.68422642 0.04206591
2 -0.15895586 0.70694013
3 -0.37005028 0.53982309
4 0.08448970 0.89255250
5 0.86860091 0.05603661
6 0.19544883 0.75274040
7 -0.94695380 0.01454887
8 -0.03817885 0.94275955
9 -0.15214122 0.77354897
10 -0.22997890 0.70978386
11 NA NA
12 NA NA
13 -0.27769887 0.59415930
14 -0.09768153 0.81800885
15 -0.20986632 0.61790214
16 -0.40474976 0.31990456
17 -0.00605937 0.98863896
18 0.02176976 0.95919460
19 -0.14755097 0.72733118
20 -0.25830856 0.50216600
I would also like the errors to keep being printed
Here is the data:
> dput(v1)
c(-0.840396, 0.4746047, -1.101857, 0.5164767, 1.2203134, -0.9758888,
-0.3657913, -0.6272523, -0.5853803, 1.7367901)
> dput(dat)
structure(list(s1 = c(-0.52411895, 0.14709633, 0.05433954, 0.7504406,
-0.59971988, -0.59679685, -0.12571854, 0.73289705, -0.71668771,
-0.04813957, -0.67849896, -0.11947141, -0.26371884, -1.34137162,
2.60928064, -1.23397547, 0.51811222, -4.10759883, -0.70127093,
7.51914575), s2 = c(0.21446623, -0.27281487, NA, NA, NA, NA,
NA, NA, -0.62468391, NA, NA, NA, -3.84387999, 0.64010069, NA,
NA, NA, NA, NA, NA), s3 = c(0.3461212, 0.279062, NA, NA, NA,
-0.4737744, 0.6313365, -2.8472641, 1.2647846, 2.2524449, -0.7913039,
-0.752590307, -3.535815266, 1.692385187, 3.55789764, -1.694910854,
-3.624517121, -4.963855198, 2.395998161, 5.35680032), s4 = c(0.3579742,
0.3522745, -1.1720907, 0.4223402, 0.146605, -0.3175295, -1.383926807,
-0.688551166, NA, NA, NA, NA, NA, 0.703612974, 1.79890268, -2.625404608,
-3.235884921, -2.845474098, 0.058650461, 1.83900702), s5 = c(1.698104376,
NA, NA, NA, NA, NA, -1.488000007, -0.739488766, 0.276012387,
0.49344994, NA, NA, -1.417434166, -0.644962513, 0.04010434, -3.388182254,
2.900252493, -1.493417096, -2.852256003, -0.98871696), s6 = c(0.3419271,
0.2482013, -1.2230283, 0.270752, -0.6653978, -1.1357202, NA,
NA, NA, NA, NA, NA, NA, NA, -1.0288213, -1.17817328, 6.1682455,
1.02759131, -3.80372867, -2.6249692), s7 = c(0.3957243, 0.8758406,
NA, NA, NA, NA, NA, 0.60196247, -1.28631859, -0.5754757, NA,
NA, NA, NA, NA, NA, NA, NA, NA, -2.6303001), s8 = c(-0.26409595,
1.2643281, 0.05687957, -0.09459169, -0.7875279, NA, NA, NA, NA,
NA, NA, NA, 2.42442997, -0.00445559, -1.0341522, 2.47315322,
0.1190265, 5.82533417, 0.82239131, -0.8279679), s9 = c(0.237123,
-0.5004619, 0.4447322, -0.2155249, -0.2331443, 1.3438071, -0.3817672,
1.9228182, 0.305661, -0.01348, NA, NA, 3.4009042, 0.8268469,
0.2061843, -1.1228663, -0.1443778, 4.8789902, 1.3480328, 0.4258486
), s10 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
0.5211859, 0.2196643, -1.2333367, 0.1186947, 1.478086, 0.5211859,
0.2196643)), .Names = c("s1", "s2", "s3", "s4", "s5", "s6", "s7",
"s8", "s9", "s10"), class = "data.frame", row.names = c(NA, -20L
))
A solution with tryCatch could be
for(i in 1:nrow(dat)){
print(i)
corr <- tryCatch(cor.test(as.numeric(dat[i,]), v1, na.action = "na.omit"), error = function(e) return(NA))
if(length(corr) == 1){
result_table[i,1] <- NA
result_table[i,2] <- NA
}else{
result_table[i,1] <- corr$estimate
result_table[i,2] <- corr$p.value
}
}
Here is a solution with tryCatch():
Replacing the for loop with:
for(i in 1:nrow(dat)){
tryCatch({
print(i)
corr <- cor.test(as.numeric(dat[i,]), v1, na.action = "na.omit") # Correlation miRNA activity vs CNVs for that gene
result_table[i,1] <- corr$estimate
result_table[i,2] <- corr$p.value
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
}

pivoting data with rownames to be colnames r [duplicate]

This question already has answers here:
Reshaping data.frame from wide to long format
(8 answers)
Closed 5 years ago.
I have the following dataset
structure(list(Year = c("Oranges", "Cherrys", "Apples", "Bananas"
), `42461` = c(0, NA, 12, NA), `42491` = c(1, 12, NA, NA), `42522` = c(1,
12, 7, NA), `42552` = c(NA, 12, 6, NA), `42583` = c(2, NA, 8,
NA), `42614` = c(NA, 12, 5, NA), `42644` = c(NA, NA, 4, NA),
`42675` = c(NA, 12, NA, NA), `42705` = c(NA, 3, NA, NA),
`42736` = c(NA, NA, 12, NA), `42767` = c(NA, NA, 12, NA),
`42795` = c(NA, 12, NA, NA), Total = c(0, 0, 0, 0)), .Names = c("Year",
"42461", "42491", "42522", "42552", "42583", "42614", "42644",
"42675", "42705", "42736", "42767", "42795", "Total"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -4L))
I would like to pivot it to look like:
Category-Values-Year
I tried the following:
datdat %>% gather(Cat,Var)
but the problem is that the year is the name of each column.
I removed the "Totals" column, I'm not sure if this is what you're asking for:
library (data.table)
dat = data.table (structure(list(Year = c("Oranges", "Cherrys", "Apples",
"Bananas"
), `42461` = c(0, NA, 12, NA), `42491` = c(1, 12, NA, NA), `42522` = c(1,
12, 7, NA), `42552` = c(NA, 12, 6, NA), `42583` = c(2, NA, 8,
NA), `42614` = c(NA, 12, 5, NA), `42644` = c(NA, NA, 4, NA),
`42675` = c(NA, 12, NA, NA), `42705` = c(NA, 3, NA, NA),
`42736` = c(NA, NA, 12, NA), `42767` = c(NA, NA, 12, NA),
`42795` = c(NA, 12, NA, NA), Total = c(0, 0, 0, 0)), .Names = c("Year",
"42461", "42491", "42522", "42552", "42583", "42614", "42644",
"42675", "42705", "42736", "42767", "42795", "Total"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -4L)))
names (dat)[1] = "Category"
dat [, "Total" := NULL]
melt.dat = melt (dat, id.vars = c("Category"), variable.name = "Year")
melt.dat gives you:
> head (melt.dat)
Category Year value
1: Oranges 42461 0
2: Cherrys 42461 NA
3: Apples 42461 12
4: Bananas 42461 NA
5: Oranges 42491 1
6: Cherrys 42491 12
Also note, the table is a data.table, not a data.frame :)
Forgot to mention, run install.packages ("data.table") if you don't have it yet

How can I get highcharter to represent a forecast object?

This is a follow-on to this question.
I am trying to get the pipeline given in that question to accept a forecast object as input:
Again, using this data:
> dput(t)
structure(c(2, 2, 267822980, 325286564, 66697091, 239352431,
94380295, 1, 126621669, 158555699, 32951026, 23, 108000151, 132505189,
29587564, 120381505, 25106680, 117506099, 22868767, 115940080,
22878163, 119286731, 22881061), .Dim = c(23L, 1L), index = structure(c(1490990400,
1490994000, 1490997600, 1491001200, 1491004800, 1491008400, 1491012000,
1491026400, 1491033600, 1491037200, 1491040800, 1491058800, 1491062400,
1491066000, 1491069600, 1491073200, 1491076800, 1491109200, 1491112800,
1491120000, 1491123600, 1491156000, 1491159600), tzone = "US/Mountain", tclass = c("POSIXct",
"POSIXt")), class = c("xts", "zoo"), .indexCLASS = c("POSIXct",
"POSIXt"), tclass = c("POSIXct", "POSIXt"), .indexTZ = "US/Mountain", tzone = "US/Mountain", .CLASS = "double", .Dimnames = list(
NULL, "count"))
I use
highchart(type = 'stock') %>%
hc_add_series(t) %>%
hc_xAxis(type = 'datetime')
To create
But if I follow this same recipe using
require("forecast")
t.arima <- auto.arima(t)
x <- forecast(t.arima, level = c(95, 80))
highchart(type = 'stock') %>%
hc_add_series(x) %>%
hc_xAxis(type = 'datetime')
I get this error:
Error in as.Date.ts(.) : unable to convert ts time to Date class
How can I show the forecast series along with the historical? I've seen this in the documentation, but don't understand why I'd be getting this error.
JS CONSOLE OUTPUT FOR JK:
DF DATA AFTER RE-INDEXING:
dput(df)
structure(list(Index = structure(c(1490968800, 1490972400, 1490976000,
1490979600, 1490983200, 1490986800, 1490990400, 1491004800, 1491012000,
1491015600, 1491019200, 1491037200, 1491040800, 1491044400, 1491048000,
1491051600, 1491055200, 1491087600, 1491091200, 1491098400, 1491102000,
1491134400, 1491138000, 1491217200, 1491220800, 1491224400, 1491228000,
1491231600, 1491235200, 1491238800, 1491242400, 1491246000, 1491249600,
1491253200, 1491256800, 1491260400, 1491264000, 1491267600), class = c("POSIXct",
"POSIXt")), Data = c(2, 2, 259465771, 315866206, 64582553, 233440220,
91918347, 1, 126563786, 158555699, 32951026, 23, 108000151, 132505189,
29587564, 120381505, 25106680, 117506099, 22868767, 115898351,
22878163, 119285747, 22881061, 157925588, 32447780, 223096830,
281656273, 45406684, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
Fitted = c(102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
`Point Forecast` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143), `Lo 80` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -16003477.5789723,
-16003477.5789723, -16003477.5789723, -16003477.5789723,
-16003477.5789723, -16003477.5789723, -16003477.5789723,
-16003477.5789723, -16003477.5789723, -16003477.5789723),
`Hi 80` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 220344625.293258, 220344625.293258, 220344625.293258,
220344625.293258, 220344625.293258, 220344625.293258, 220344625.293258,
220344625.293258, 220344625.293258, 220344625.293258), `Lo 95` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -78561041.5917782,
-78561041.5917782, -78561041.5917782, -78561041.5917782,
-78561041.5917782, -78561041.5917782, -78561041.5917782,
-78561041.5917782, -78561041.5917782, -78561041.5917782),
`Hi 95` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 282902189.306064, 282902189.306064, 282902189.306064,
282902189.306064, 282902189.306064, 282902189.306064, 282902189.306064,
282902189.306064, 282902189.306064, 282902189.306064)), .Names = c("Index",
"Data", "Fitted", "Point Forecast", "Lo 80", "Hi 80", "Lo 95",
"Hi 95"), row.names = c(NA, -38L), class = "data.frame")
Not sure this is due to the irregular time series.
Anyway, ggfortify:::fortify.forecast is your friend. Why? Because fortify (try to) transform all the R object in data frames. So:
library(highcharter)
library(forecast)
t.arima <- auto.arima(t)
x <- forecast(t, level = c(95, 80))
library(highcharter)
library(ggplot2)
library(ggfortify)
#>
#> Attaching package: 'ggfortify'
#> The following object is masked from 'package:forecast':
#>
#> gglagplot
class(x)
#> [1] "forecast"
df <- fortify(x)
head(df)
#> Index Data Fitted Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
#> 1 1 2 140658844 NA NA NA NA NA
#> 2 3601 2 121734145 NA NA NA NA NA
#> 3 7201 267822980 105355638 NA NA NA NA NA
#> 4 10801 325286564 127214522 NA NA NA NA NA
#> 5 14401 66697091 153863779 NA NA NA NA NA
#> 6 18001 239352431 142136089 NA NA NA NA NA
Now you can:
highchart(type = "stock") %>%
hc_add_series(df, "line", hcaes(Index, Data), name = "Original") %>%
hc_add_series(df, "line", hcaes(Index, Fitted), name = "Fitted") %>%
hc_add_series(df, "line", hcaes(Index, `Point Forecast`), name = "Forecast") %>%
hc_add_series(df, "arearange", hcaes(Index, low = `Lo 80`, high = `Hi 80`), name = "Interval")
As you can see, fortify can't detect the real time too. So you need to transform the Index in the time what you want.
The error
Error in as.Date.ts(.) : unable to convert ts time to Date class
is due to the fact that you have a ts object with a frequency that is not covered by the function as.Date.ts(.). When we see what this function does, this is what we get:
function (x, offset = 0, ...)
{
time.x <- unclass(time(x)) + offset
if (frequency(x) == 1)
as.Date(paste(time.x, 1, 1, sep = "-"))
else if (frequency(x) == 4)
as.Date(paste((time.x + 0.001)%/%1, 3 * (cycle(x) - 1) +
1, 1, sep = "-"))
else if (frequency(x) == 12)
as.Date(paste((time.x + 0.001)%/%1, cycle(x), 1, sep = "-"))
else stop("unable to convert ts time to Date class")
}
This function considers only 3 values for the frequency of a ts object: 1, 4, or 12. When we take a look at the frequency of your object x, we see that its frequency = 0.000277777777777778, so when highcharter calls the function using the ts objects in x it stops and gives you that error.
We have two options on how to "fix" it:
Transform t into a ts object (instead of a xts object) with frequency = 1 before running auto.arima and forecast;
After running auto.arima and forecast, we can create an index for the future dates and transform the ts objects in x into xts objects with the correct index.
I said "fix" because these solutions are not perfect, as we will see.
Option 1
t <- structure(
c(2, 2, 267822980, 325286564, 66697091, 239352431,
94380295, 1, 126621669, 158555699, 32951026, 23,
108000151, 132505189, 29587564, 120381505, 25106680,
117506099, 22868767, 115940080, 22878163, 119286731,
22881061),
.Dim = c(23L, 1L),
index = structure(c(1490990400, 1490994000, 1490997600,
1491001200, 1491004800, 1491008400,
1491012000, 1491026400, 1491033600,
1491037200, 1491040800, 1491058800,
1491062400, 1491066000, 1491069600,
1491073200, 1491076800, 1491109200,
1491112800, 1491120000, 1491123600,
1491156000, 1491159600),
tzone = "US/Mountain",
tclass = c("POSIXct","POSIXt")),
class = c("xts", "zoo"),
.indexCLASS = c("POSIXct","POSIXt"),
tclass = c("POSIXct", "POSIXt"),
.indexTZ = "US/Mountain",
tzone = "US/Mountain",
.CLASS = "double",
.Dimnames = list(NULL, "count"))
require("forecast")
library(highcharter)
# SOLUTION 1
t.tmp <- ts(t, start=1, end = length(t))
t.arima.1 <- auto.arima(t.tmp)
x.1 <- forecast(t.arima.1, level = c(95, 80))
highchart(type = 'stock') %>%
hc_add_series(x.1) %>%
hc_add_series(x.1$x, name = "Original") %>%
hc_add_series(x.1$fitted, name = "Fitted")
The problem with this approach is that we lose the dates (axis, tooltip, etc.).
Option 2, 1st try: Hourly Forecasts
I tried to create an hourly index for the future values, but for some reason Highcharter moves the intervals to the left (or there's some problem with the dates that I can't see/figure out).
Option 2, 2nd try: Daily Forecasts
When I changed it to a daily index for the future values it worked, but it's weird since we have hourly observations and the forecast part of our plot shows "daily forecasts".
Here is the full code:
t <- structure(
c(2, 2, 267822980, 325286564, 66697091, 239352431,
94380295, 1, 126621669, 158555699, 32951026, 23,
108000151, 132505189, 29587564, 120381505, 25106680,
117506099, 22868767, 115940080, 22878163, 119286731,
22881061),
.Dim = c(23L, 1L),
index = structure(c(1490990400, 1490994000, 1490997600,
1491001200, 1491004800, 1491008400,
1491012000, 1491026400, 1491033600,
1491037200, 1491040800, 1491058800,
1491062400, 1491066000, 1491069600,
1491073200, 1491076800, 1491109200,
1491112800, 1491120000, 1491123600,
1491156000, 1491159600),
tzone = "US/Mountain",
tclass = c("POSIXct","POSIXt")),
class = c("xts", "zoo"),
.indexCLASS = c("POSIXct","POSIXt"),
tclass = c("POSIXct", "POSIXt"),
.indexTZ = "US/Mountain",
tzone = "US/Mountain",
.CLASS = "double",
.Dimnames = list(NULL, "count"))
require("forecast")
library(highcharter)
library(xts)
t.arima <- auto.arima(t)
x <- forecast(t.arima, level = c(95, 80))
# Problem
## Time from 'forecast'
time.x <- time(x$mean) # ts variable
time.x # see that frequency = 0.000277777777777778
## Original time
time.t <- time(t) # POSIXct variable, use as.ts to see frequency
as.ts(time.t) # frequency = 1
## Try to transform back to formatted date
as.POSIXct(as.double(time.t), tz = "US/Mountain", origin = "1970-01-01")
as.POSIXct(as.double(time.x), tz = "US/Mountain", origin = "1970-01-01")
#--------------------------------------------------------#
# SOLUTION 1
t.tmp <- ts(t, start=1, end = length(t))
t.arima.1 <- auto.arima(t.tmp)
x.1 <- forecast(t.arima.1, level = c(95, 80))
highchart(type = 'stock') %>%
hc_add_series(x.1) %>%
hc_add_series(x.1$x, name = "Original") %>%
hc_add_series(x.1$fitted, name = "Fitted")
#------------------------------------------------------#
# SOLUTION 2 - With correct dates but wrong plot
## Create new forecast variable
x.2 <- forecast(t.arima.1, level = c(95, 80))
## Take forecast length
forecast.length <- length(time.x)
### Create New Forecast dates (HOUR)
### Since I don't know the exact forecast times, I'll add one HOUR
### for each obs starting from the last date in the original dataset
last.date <- time.t[length(time.t)]
new.forecast.time.hour <- as.POSIXct(last.date) + c((1:forecast.length)*3600)
## Insert date back
x.2$mean <- xts(x.1$mean, order.by = new.forecast.time.hour)
x.2$lower <- xts(x.1$lower, order.by = new.forecast.time.hour)
x.2$upper <- xts(x.1$upper, order.by = new.forecast.time.hour)
### Original Data
x.2$x <- xts(x.1$x, order.by = time.t)
### Fitted
x.2$fitted <- xts(x.1$fitted, order.by = time.t)
# Plot forecasts with correct date
highchart(type = 'stock') %>%
hc_add_series(x.2) %>%
hc_add_series(x.2$x, name = "Original") %>%
hc_add_series(x.2$fitted, name = "Fitted") %>%
hc_xAxis(type = 'datetime')
#------------------------------------------------------#
# SOLUTION 3 - Correct plot but only for daily forecasts
## Create new forecast variable
x.3 <- forecast(t.arima.1, level = c(95, 80))
## Take forecast length
forecast.length <- length(time.x)
### Create New Forecast dates (DAY)
### Since I don't know the exact forecast times, I'll add one DAY
### for each obs starting from the last date in the original dataset
last.date <- time.t[length(time.t)]
new.forecast.time.day <- as.POSIXct(last.date) + c((1:forecast.length)*3600*24)
## Add change from as.POSIXct to as.Date
new.forecast.time.day <- as.Date(new.forecast.time.day)
## Insert date back
x.3$mean <- xts(x.1$mean, order.by = new.forecast.time.day)
x.3$lower <- xts(x.1$lower, order.by = new.forecast.time.day)
x.3$upper <- xts(x.1$upper, order.by = new.forecast.time.day)
### Original Data
x.3$x <- xts(x.1$x, order.by = time.t)
### Fitted
x.3$fitted <- xts(x.1$fitted, order.by = time.t)
# Plot forecasts with correct date
highchart(type = 'stock') %>%
hc_add_series(x.3) %>%
hc_add_series(x.3$x, name = "Original") %>%
hc_add_series(x.3$fitted, name = "Fitted") %>%
hc_xAxis(type = 'datetime')
One other thing: the fitted values on my plots differ from the fitted values on jbkunst's plot because he used forecast directly on t, not on t.arima (just a typo, I believe). This way, my forecasts are based on an Arima model, while his are based on an ETS model.

Resources