Create a script with 2 vectors in R

Create a script with 2 vectors in R - r

I'm using Heatmap from the package complexheatmap
in the script, I need to create a variable ha_column that I will incorporate into my script.
ha_column = HeatmapAnnotation (df = data.frame(type1=c(rep("name1",5), rep("name2",5),rep("name3",5), col = list(type1=c("name1" = "#DCDCDC", "name2" = "#DC928B", "name2"="#BA72D3")))))
I have 2 vectors:
vectors1=c("name1","name2","name3)
vectors2=c("#DCDCDC","#DC928B","#BA72D3")
and the idea is to reproduce the above script with these two vectors.
I tried:
paste0("ha_column = HeatmapAnnotation(df = data.frame(type1 = c(rep(",vectors1,", 5),col = list(type1 = c(",vectors1,"=",vectors2,")))")
bu it only paste line by line such as:
[1] "ha_column = HeatmapAnnotation(df = data.frame(type1 = c(rep(name1, 5),col = list(type1 = c(name1=#DCDCDC)))"
[2] "ha_column = HeatmapAnnotation(df = data.frame(type1 = c(rep(name2, 5),col = list(type1 = c(name2=#DC928B)))"
[3] "ha_column = HeatmapAnnotation(df = data.frame(type1 = c(rep(name3, 5),col = list(type1 = c(name3=#BA72D3)))"
instead of doing what I want ...
Does anyone have an idea?
Thanks for your time.

It's generally not a good idea to build code as a string. Instead think of building a function to do what you want.
You could do something line
ha_column_fun = function(names, colors) {
HeatmapAnnotation(
df = data.frame(type1 = rep(names, each=5)),
col = list(type1=setNames(colors, names))
)
}
And then you could call it with
ha_column = ha_column_fun(vectors1, vectors2)

Related

Remove columns with many NA values using mlr3pipelines

I am trying to remove columns where proportion of NA value are greater than na_cutoff threshold using mlr3pipelines.
Here is my try:
library(mlr3)
library(mlr3pipelines)
task = tsk("iris")
dt = task$data()
dt[1:50, Sepal.Width := NA]
task_ = as_task_classif(dt, target = "Species")
graph = po("removeconstants", id = "removeconstants", ratio = 0.01) %>>%
po("select", id = "drop_na_cols")
ps = ParamSet$new(list(ParamDbl$new("na_cutoff", lower = 0, upper = 1, default = 0.2)))
graph$param_set$add(ps)
graph$param_set
graph$param_set$trafo = function(x, param_set) {
na_cutoff = x$na_cutoff
print(na_cutoff)
x$drop_na_cols.selector = function(task) {
fn = task$feature_names
data = task$data(cols = fn)
drop <- which(colMeans(is.na(data)) > na_cutoff)
fn[-drop]
}
x$na_cutoff = NULL
x
}
train_res = graph$train(task_)
train_res$drop_na_cols.output$data()
The problem is that last column is not removed even it should be.

In general, trafos are not meant for parameter sets.
I.e. internally, when the Graph accesses the parameters, the parameter transformation is not applied.
They are intended to create search spaces for black-box optimization, including hyperparameter optimization of ML models.
Also, you modifying the parameter set of an existing Graph is a bad idea.
The way to go I believe is to use the PipeOpSelect with a custom selector: https://mlr3pipelines.mlr-org.com/reference/Selector.html

Following this issue https://github.com/mlr-org/mlr3pipelines/issues/313
I thought the recommended way to do this is through trafo on select pipe.
Nevertheless, I have just created new pipeop that removes columns with many NA values:
library(mlr3pipelines)
library(mlr3verse)
library(mlr3misc)
library(R6)
PipeOpDropNACol = R6::R6Class(
"PipeOpDropNACol",
inherit = mlr3pipelines::PipeOpTaskPreprocSimple,
public = list(
initialize = function(id = "drop.nacol", param_vals = list()) {
ps = ParamSet$new(list(
ParamDbl$new("cutoff", lower = 0, upper = 1, default = 0.05, tags = c("dropnacol_tag"))
))
ps$values = list(cutoff = 0.2)
super$initialize(id, param_set = ps, param_vals = param_vals)
}
),
private = list(
.get_state = function(task) {
pv = self$param_set$get_values(tags = "dropnacol_tag")
print(pv$cutoff)
features_names = task$feature_names
data = task$data(cols = features_names)
print(data)
many_na = sapply(data, function(column) (sum(is.na(column))) / length(column) > pv$cutoff)
print(many_na)
list(cnames = colnames(data)[-many_na])
},
.transform = function(task) {
task$select(self$state$cnames)
}
)
)
# no group variable
task = tsk("iris")
dt = task$data()
dt[1:50, Sepal.Width := NA]
task = as_task_classif(dt, target = "Species")
gr = Graph$new()
gr$add_pipeop(PipeOpDropNACol$new())
result = gr$train(task)
result[[1]]$data()
gr$predict(task)

How to convert a dataframe from questions based columns to answers based columns

I have this example dataframe
questions = data.frame(subjects = 1:8,
are_you_sad = c(1,1,2,3,4,5,3,2),
are_you_worried = c(1,3,1,2,2,4,5,3))
and I want to convert it to:
questions = data.frame(subjects = 1:8,
are_you_sad_1 = c(1,1,0,0,0,0,0,0),
are_you_sad_2 = c(0,0,1,0,0,0,0,1),
are_you_sad_3 = c(0,0,0,1,0,0,1,0),
are_you_sad_4 = c(0,0,0,0,1,0,0,0),
are_you_sad_5 = c(0,0,0,0,0,1,0,0),
are_you_worried_1 = c(1,0,1,0,0,0,0,0),
are_you_worried_2 = c(0,0,0,1,1,0,0,0),
are_you_worried_3 = c(0,1,0,0,0,0,0,1),
are_you_worried_4 = c(0,0,0,0,0,1,0,0),
are_you_worried_5 = c(0,0,0,0,0,0,1,0)
)
can someone guide me through a simple function to make that possible? thanks.

Use fastDummies::dummy_cols:
library(fastDummies)
dummy_cols(questions,
select_columns = c("are_you_sad", "are_you_worried"),
remove_selected_columns = TRUE)

making 2 box plots from the same data frame in R

I want to make a 2 box plots with y being weight and x being the before and after. so two different boxplot will be displayed at the same time.
`rats_before = data.frame(
rat_num = paste0(rep("rat number",200),1:200),
weight = rweibull(200,shape= 10,scale = 20))
rats_after = data.frame(
rat_num = paste0(rep("rat number",200),1:200),
weight = rweibull(200,shape= 9,scale = 21))
rats = merge(rats_before,rats_after, by = c("rat_num"))`
i know the next part is not even close but it will give you a idea of what im trying to do.
rat_boxplot = qplot(y = weight, x = (rats_after, rats_before), geom = "boxplot", data = rats)

Or, if you want to do this in base R -
rats_before = data.frame(
rat_num = paste0(rep("rat number",200),1:200),
weight = rweibull(200,shape= 10,scale = 20))
rats_after = data.frame(
rat_num = paste0(rep("rat number",200),1:200),
weight = rweibull(200,shape= 9,scale = 21))
rats <- rbind(rats_before, rats_after)
rats$type <- c(rep("before", nrow(rats_before)), rep("after", nrow(rats_after)))
rats$type <- factor(rats$type)
rats$type <- relevel(rats$type, ref = 2)
boxplot(weight ~ type, data = rats)

You can add a column to each df ans userbind which will bind the rows of the two df instead of merge you can use. Then you simply have to use the aes of a ggplot.
rats_before$condition = "before"
rats_after$condition = "after"
rats = rbind(rats_before,rats_after)
ggplot(rats)+geom_boxplot(aes(condition,weight))
Hope I understood your question.
Tom

HeatMap: how to cluster only the rows and keep order of the heatmap's column labels as same as in the df?

I wanna plot a heatmap and cluster only the rows (i.e. genes in this tydf1).
Also, wanna keep order of the heatmap's column labels as same as in the df (i.e. tydf1)?
Sample data
df1 <- structure(list(Gene = c("AA", "PQ", "XY", "UBQ"), X_T0_R1 = c(1.46559502, 0.220140568, 0.304127515, 1.098842127), X_T0_R2 = c(1.087642983, 0.237500819, 0.319844338, 1.256624804), X_T0_R3 = c(1.424945196, 0.21066267, 0.256496284, 1.467120048), X_T1_R1 = c(1.289943948, 0.207778662, 0.277942721, 1.238400358), X_T1_R2 = c(1.376535013, 0.488774258, 0.362562315, 0.671502431), X_T1_R3 = c(1.833390311, 0.182798731, 0.332856558, 1.448757569), X_T2_R1 = c(1.450753714, 0.247576125, 0.274415259, 1.035410946), X_T2_R2 = c(1.3094609, 0.390028842, 0.352460646, 0.946426593), X_T2_R3 = c(0.5953716, 1.007079177, 1.912258811, 0.827119776), X_T3_R1 = c(0.7906009, 0.730242116, 1.235644748, 0.832287694), X_T3_R2 = c(1.215333041, 1.012914813, 1.086362205, 1.00918082), X_T3_R3 = c(1.069312467, 0.780421013, 1.002313082, 1.031761442), Y_T0_R1 = c(0.053317766, 3.316414959, 3.617213894, 0.788193798), Y_T0_R2 = c(0.506623748, 3.599442788, 1.734075583, 1.179462912), Y_T0_R3 = c(0.713670106, 2.516735845, 1.236204882, 1.075393433), Y_T1_R1 = c(0.740998252, 1.444496448, 1.077023349, 0.869258744), Y_T1_R2 = c(0.648231834, 0.097957459, 0.791438659, 0.428805547), Y_T1_R3 = c(0.780499252, 0.187840968, 0.820430227, 0.51636582), Y_T2_R1 = c(0.35344654, 1.190274584, 0.401845911, 1.223534348), Y_T2_R2 = c(0.220223951, 1.367784148, 0.362815405, 1.102117612), Y_T2_R3 = c(0.432856978, 1.403057729, 0.10802472, 1.304233845), Y_T3_R1 = c(0.234963735, 1.232129062, 0.072433381, 1.203096462), Y_T3_R2 = c(0.353770497, 0.885122768, 0.011662112, 1.188149743), Y_T3_R3 = c(0.396091395, 1.333921747, 0.192594116, 1.838029829), Z_T0_R1 = c(0.398000559, 1.286528398, 0.129147097, 1.452769794), Z_T0_R2 = c(0.384759325, 1.122251177, 0.119475721, 1.385513609), Z_T0_R3 = c(1.582230097, 0.697419716, 2.406671502, 0.477415567), Z_T1_R1 = c(1.136843842, 0.804552001, 2.13213228, 0.989075996), Z_T1_R2 = c(1.275683837, 1.227821594, 0.31900326, 0.835941568), Z_T1_R3 = c(0.963349308, 0.968589683, 1.706670339, 0.807060135), Z_T2_R1 = c(3.765036263, 0.477443352, 1.712841882, 0.469173869), Z_T2_R2 = c(1.901023385, 0.832736132, 2.223429427, 0.593558769), Z_T2_R3 = c(1.407713024, 0.911920317, 2.011259223, 0.692553388), Z_T3_R1 = c(0.988333629, 1.095130142, 1.648598854, 0.629915612), Z_T3_R2 = c(0.618606729, 0.497458337, 0.549147265, 1.249492088), Z_T3_R3 = c(0.429823986, 0.471389536, 0.977124788, 1.136635484)), row.names = c(NA, -4L ), class = c("data.table", "data.frame"))
Scripts used
library(dplyr)
library(stringr)
library(tidyr)
gdf1 <- gather(df1, "group", "Expression", -Gene)
gdf1$tgroup <- apply(str_split_fixed(gdf1$group, "_", 3)[, c(1, 2)],
1, paste, collapse ="_")
library(dplyr)
tydf1 <- gdf1 %>%
group_by(Gene, tgroup) %>%
summarize(expression_mean = mean(Expression)) %>%
spread(., tgroup, expression_mean)
#1 heatmap script is being used
library(tidyverse)
tydf1 <- tydf1 %>%
as.data.frame() %>%
column_to_rownames(var=colnames(tydf1)[1])
library(gplots)
library(vegan)
randup.m <- as.matrix(tydf1)
scaleRYG <- colorRampPalette(c("red","yellow","darkgreen"),
space = "rgb")(30)
data.dist <- vegdist(randup.m, method = "euclidean")
row.clus <- hclust(data.dist, "aver")
heatmap.2(randup.m, Rowv = as.dendrogram(row.clus),
dendrogram = "row", col = scaleRYG, margins = c(7,10),
density.info = "none", trace = "none", lhei = c(2,6),
colsep = 1:3, sepcolor = "black", sepwidth = c(0.001,0.0001),
xlab = "Identifier", ylab = "Rows")
#2 heatmap script is being used
df2 <- as.matrix(tydf1[, -1])
heatmap(df2)
Also, I want to add a color key.

It is still unclear to me, what the desired output is. There are some notes:
You don't need to use vegdist() to calculate distance matrix for your hclust() call. Because if you check all(vegdist(randup.m, method = "euclidian") == dist(randup.m)) it returns TRUE;
Specifying Colv = F in your heatmap.2() call will prevent reordering of the columns (default is TRUE);
Maybe it is better to scale your data by row (see the uncommented row);
Your call of heatmap.2() returns the heatmap with color key.
So summing it up - in your first script you just miss the Colv = F argument, and after a little adjustment it looks like this:
heatmap.2(randup.m,
Rowv = as.dendrogram(row.clus),
Colv = F,
dendrogram = "row",
#scale = "row",
col = scaleRYG,
density.info = "none",
trace = "none",
srtCol = -45,
adjCol = c(.1, .5),
xlab = "Identifier",
ylab = "Rows"
)
However I am still not sure - is it what you need?

Create argument list using lapply for do.call

I'm trying to pass a set of modified arguments from a larger function to arguments in a nested function. This is an argument supplied from the larger function:
time_dep_covariates_list = c(therapy_start = "Start of Therapy",
therapy_end = "End of Therapy")
I have these sets of constant arguments:
tmerge_args_1 <- alist(data1 = analytic_dataset,
data2 = analytic_dataset,
id = patientid,
tstop = adv_dx_to_event,
death_censor = event(adv_dx_to_event))
And I want to append these modified arguments to that argument list:
tmerge_args_2 <- lapply(1:length(time_dep_covariates_list), function(x){
tmerge_args <<- c(tmerge_args, alist('var' = tdc(var)) )
paste0(names(time_dep_covariates_list[x])," =
tdc(",names(time_dep_covariates_list[x]), ")")
})
> tdc_args
[[1]]
[1] "therapy_start = tdc(therapy_start)"
[[2]]
[1] "therapy_end = tdc(therapy_end)"
I want to create a do.call that handles the arguments like so:
count_process_form <- do.call(tmerge, args = c(tmerge_args_1,
tmerge_args_2)
That would be identical to the following:
tmerge(data1 = analytic_dataset, data2 = analytic_dataset,
id = patientid, tstop = adv_dx_to_event,
therapy_start = tdc(therapy_start), therapy_end = tdc(therapy_end)
It works fine with tmerge_args_1 by itself, but as the args_2 are character and not language elements, I get this error:
Error in (function (data1, data2, id, ..., tstart, tstop, options) :
all additional argments [sic] must have a name:
How can I modify the list I'm creating for args_2 so they're stored as arguments that do.call can understand? Or am I approaching this all wrong?
Thanks!
Here is a reproducible example:
analytic_dataset= data_frame(patientid = sample(1:1000,5),
adv_dx_to_event = sample(100:200, 5),
death_censor = sample(0:1,5, replace = T),
therapy_start = sample(1:20,5),
therapy_stop = sample(40:100,5))
The below would be passed in from a function:
time_dep_covariates_list = c(therapy_start = "Start of Therapy",
therapy_end = "End of Therapy")
tmerge_args_1 <- alist(data1 = analytic_dataset,
data2 = analytic_dataset,
id = patientid,
tstop = adv_dx_to_event,
death_censor = event(adv_dx_to_event))
do.call(tmerge,tmerge_args_1) #this works
tmerge_args_2 <- lapply(1:length(time_dep_covariates_list), function(x){
tmerge_args <<- c(tmerge_args, alist('var' = tdc(var)) )
paste0(names(time_dep_covariates_list[x])," = tdc(",names(time_dep_covariates_list[x]), ")")
})
do.call(tmerge,tmerge_args_1,tmerge_args_2) # this doesn't```

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Create a script with 2 vectors in R - r

Related

Remove columns with many NA values using mlr3pipelines

How to convert a dataframe from questions based columns to answers based columns

making 2 box plots from the same data frame in R

HeatMap: how to cluster only the rows and keep order of the heatmap's column labels as same as in the df?

Create argument list using lapply for do.call

Categories

Resources