Related
I am trying to remove columns where the proportion of NA values is greater than an na_cutoff threshold, using mlr3pipelines.
Here is my try:
# Attempt: drop features whose NA share exceeds `na_cutoff` by attaching a
# trafo to the Graph's parameter set.
library(mlr3)
library(mlr3pipelines)
# Build an iris classification task in which the first 50 rows of Sepal.Width
# are NA.
task = tsk("iris")
dt = task$data()
dt[1:50, Sepal.Width := NA]
task_ = as_task_classif(dt, target = "Species")
# Pipeline: removeconstants followed by a select PipeOp with id "drop_na_cols".
graph = po("removeconstants", id = "removeconstants", ratio = 0.01) %>>%
po("select", id = "drop_na_cols")
# Add an extra `na_cutoff` parameter to the Graph's parameter set.
ps = ParamSet$new(list(ParamDbl$new("na_cutoff", lower = 0, upper = 1, default = 0.2)))
graph$param_set$add(ps)
graph$param_set
# The trafo is meant to turn `na_cutoff` into a selector function for the
# "drop_na_cols" PipeOp and then remove `na_cutoff` from the values.
graph$param_set$trafo = function(x, param_set) {
na_cutoff = x$na_cutoff
print(na_cutoff)
x$drop_na_cols.selector = function(task) {
fn = task$feature_names
data = task$data(cols = fn)
drop <- which(colMeans(is.na(data)) > na_cutoff)
# NOTE(review): when `drop` is empty, fn[-drop] is fn[integer(0)] and selects
# no features at all instead of keeping every feature.
fn[-drop]
}
x$na_cutoff = NULL
x
}
# Train the graph and inspect the data coming out of the select step.
train_res = graph$train(task_)
train_res$drop_na_cols.output$data()
The problem is that the last column is not removed even though it should be.
In general, trafos are not meant for parameter sets.
I.e. internally, when the Graph accesses the parameters, the parameter transformation is not applied.
They are intended to create search spaces for black-box optimization, including hyperparameter optimization of ML models.
Also, modifying the parameter set of an existing Graph is a bad idea.
The way to go I believe is to use the PipeOpSelect with a custom selector: https://mlr3pipelines.mlr-org.com/reference/Selector.html
Following this issue https://github.com/mlr-org/mlr3pipelines/issues/313
I thought the recommended way to do this is through trafo on select pipe.
Nevertheless, I have just created new pipeop that removes columns with many NA values:
library(mlr3pipelines)
library(mlr3verse)
library(mlr3misc)
library(R6)
# PipeOpDropNACol: preprocessing PipeOp that removes every feature whose
# proportion of missing values is strictly greater than `cutoff`.
#
# Parameters:
#   cutoff  numeric in [0, 1]; features with an NA share above this value are
#           dropped. The declared default is 0.05, but the constructor sets the
#           active value to 0.2 unless `param_vals` overrides it.
#
# State:
#   cnames  names of the features that are kept; computed from the task passed
#           to .get_state() and reused by .transform().
PipeOpDropNACol = R6::R6Class(
  "PipeOpDropNACol",
  inherit = mlr3pipelines::PipeOpTaskPreprocSimple,
  public = list(
    # id          identifier of the PipeOp inside a Graph.
    # param_vals  named list of parameter values overriding the constructor
    #             values (e.g. list(cutoff = 0.5)).
    initialize = function(id = "drop.nacol", param_vals = list()) {
      ps = ParamSet$new(list(
        ParamDbl$new(
          "cutoff",
          lower = 0, upper = 1, default = 0.05,
          tags = c("dropnacol_tag")
        )
      ))
      ps$values = list(cutoff = 0.2)
      super$initialize(id, param_set = ps, param_vals = param_vals)
    }
  ),
  private = list(
    # Determine which feature columns to keep.
    # Returns list(cnames = <character vector of kept feature names>).
    .get_state = function(task) {
      pv = self$param_set$get_values(tags = "dropnacol_tag")
      data = task$data(cols = task$feature_names)
      # One logical per column: TRUE when the NA share exceeds the cutoff.
      # isTRUE() maps the NaN share of a zero-row column to FALSE so that the
      # index below never contains NA; vapply() keeps the result a logical
      # vector even when there are no feature columns.
      many_na = vapply(
        data,
        function(column) isTRUE(mean(is.na(column)) > pv$cutoff),
        logical(1)
      )
      # Logical negation, not arithmetic negation: `-many_na` would turn the
      # mask into 0/-1 indices and always drop the first column instead.
      list(cnames = colnames(data)[!many_na])
    },
    # Restrict the task to the feature names stored in the state.
    .transform = function(task) {
      task$select(self$state$cnames)
    }
  )
)
# Usage example without a grouping variable: blank out Sepal.Width for the
# first 50 iris rows, then train and predict with a one-node graph.
iris_dt = tsk("iris")$data()
iris_dt[seq_len(50), Sepal.Width := NA]
na_task = as_task_classif(iris_dt, target = "Species")

na_graph = Graph$new()
na_graph$add_pipeop(PipeOpDropNACol$new())

trained = na_graph$train(na_task)
trained[[1]]$data()
na_graph$predict(na_task)
I have this example dataframe
questions = data.frame(subjects = 1:8,
are_you_sad = c(1,1,2,3,4,5,3,2),
are_you_worried = c(1,3,1,2,2,4,5,3))
and I want to convert it to:
questions = data.frame(subjects = 1:8,
are_you_sad_1 = c(1,1,0,0,0,0,0,0),
are_you_sad_2 = c(0,0,1,0,0,0,0,1),
are_you_sad_3 = c(0,0,0,1,0,0,1,0),
are_you_sad_4 = c(0,0,0,0,1,0,0,0),
are_you_sad_5 = c(0,0,0,0,0,1,0,0),
are_you_worried_1 = c(1,0,1,0,0,0,0,0),
are_you_worried_2 = c(0,0,0,1,1,0,0,0),
are_you_worried_3 = c(0,1,0,0,0,0,0,1),
are_you_worried_4 = c(0,0,0,0,0,1,0,0),
are_you_worried_5 = c(0,0,0,0,0,0,1,0)
)
Can someone guide me through a simple function to make that possible? Thanks.
Use fastDummies::dummy_cols:
# Expand the two answer columns of `questions` into dummy columns.
# remove_selected_columns = TRUE asks for the original two columns to be
# dropped. The result is not assigned here; store it in a variable to keep it.
library(fastDummies)
dummy_cols(questions,
select_columns = c("are_you_sad", "are_you_worried"),
remove_selected_columns = TRUE)
I want to make two box plots with weight on the y-axis and before/after on the x-axis, so that the two box plots are displayed side by side in the same figure.
# Simulated weights for the same 200 rats before and after treatment.
# (The stray markdown backticks that wrapped this snippet were removed so that
# it is valid R.)
rats_before = data.frame(
  rat_num = paste0("rat number", 1:200),
  weight = rweibull(200, shape = 10, scale = 20)
)
rats_after = data.frame(
  rat_num = paste0("rat number", 1:200),
  weight = rweibull(200, shape = 9, scale = 21)
)
# Joining on rat_num gives one row per rat, with the two weight columns
# suffixed as weight.x (before) and weight.y (after).
rats = merge(rats_before, rats_after, by = c("rat_num"))
I know the next part is not even close, but it will give you an idea of what I'm trying to do.
rat_boxplot = qplot(y = weight, x = (rats_after, rats_before), geom = "boxplot", data = rats)
Or, if you want to do this in base R -
# Simulate the two measurement rounds (200 rats each).
rats_before <- data.frame(
  rat_num = paste0("rat number", 1:200),
  weight = rweibull(200, shape = 10, scale = 20)
)
rats_after <- data.frame(
  rat_num = paste0("rat number", 1:200),
  weight = rweibull(200, shape = 9, scale = 21)
)

# Stack both rounds and label each row with its round; the explicit level
# order puts "before" first on the x-axis.
rats <- rbind(rats_before, rats_after)
rats$type <- factor(
  rep(c("before", "after"), times = c(nrow(rats_before), nrow(rats_after))),
  levels = c("before", "after")
)

# One box per round.
boxplot(weight ~ type, data = rats)
You can add a column to each data frame and use rbind, which binds the rows of the two data frames, instead of merge. Then you simply have to map the new column in the aes of a ggplot.
# Tag each data frame with its measurement round, then stack them row-wise.
rats_before$condition = "before"
rats_after$condition = "after"
rats = rbind(rats_before, rats_after)
# A plain character column would be ordered alphabetically on the x-axis
# ("after" first); fixing the factor levels draws "before" to the left.
rats$condition = factor(rats$condition, levels = c("before", "after"))
ggplot(rats) + geom_boxplot(aes(condition, weight))
Hope I understood your question.
Tom
I want to plot a heatmap and cluster only the rows (i.e. the genes in tydf1).
Also, I want to keep the order of the heatmap's column labels the same as in the data frame (i.e. tydf1).
Sample data
df1 <- structure(list(Gene = c("AA", "PQ", "XY", "UBQ"), X_T0_R1 = c(1.46559502, 0.220140568, 0.304127515, 1.098842127), X_T0_R2 = c(1.087642983, 0.237500819, 0.319844338, 1.256624804), X_T0_R3 = c(1.424945196, 0.21066267, 0.256496284, 1.467120048), X_T1_R1 = c(1.289943948, 0.207778662, 0.277942721, 1.238400358), X_T1_R2 = c(1.376535013, 0.488774258, 0.362562315, 0.671502431), X_T1_R3 = c(1.833390311, 0.182798731, 0.332856558, 1.448757569), X_T2_R1 = c(1.450753714, 0.247576125, 0.274415259, 1.035410946), X_T2_R2 = c(1.3094609, 0.390028842, 0.352460646, 0.946426593), X_T2_R3 = c(0.5953716, 1.007079177, 1.912258811, 0.827119776), X_T3_R1 = c(0.7906009, 0.730242116, 1.235644748, 0.832287694), X_T3_R2 = c(1.215333041, 1.012914813, 1.086362205, 1.00918082), X_T3_R3 = c(1.069312467, 0.780421013, 1.002313082, 1.031761442), Y_T0_R1 = c(0.053317766, 3.316414959, 3.617213894, 0.788193798), Y_T0_R2 = c(0.506623748, 3.599442788, 1.734075583, 1.179462912), Y_T0_R3 = c(0.713670106, 2.516735845, 1.236204882, 1.075393433), Y_T1_R1 = c(0.740998252, 1.444496448, 1.077023349, 0.869258744), Y_T1_R2 = c(0.648231834, 0.097957459, 0.791438659, 0.428805547), Y_T1_R3 = c(0.780499252, 0.187840968, 0.820430227, 0.51636582), Y_T2_R1 = c(0.35344654, 1.190274584, 0.401845911, 1.223534348), Y_T2_R2 = c(0.220223951, 1.367784148, 0.362815405, 1.102117612), Y_T2_R3 = c(0.432856978, 1.403057729, 0.10802472, 1.304233845), Y_T3_R1 = c(0.234963735, 1.232129062, 0.072433381, 1.203096462), Y_T3_R2 = c(0.353770497, 0.885122768, 0.011662112, 1.188149743), Y_T3_R3 = c(0.396091395, 1.333921747, 0.192594116, 1.838029829), Z_T0_R1 = c(0.398000559, 1.286528398, 0.129147097, 1.452769794), Z_T0_R2 = c(0.384759325, 1.122251177, 0.119475721, 1.385513609), Z_T0_R3 = c(1.582230097, 0.697419716, 2.406671502, 0.477415567), Z_T1_R1 = c(1.136843842, 0.804552001, 2.13213228, 0.989075996), Z_T1_R2 = c(1.275683837, 1.227821594, 0.31900326, 0.835941568), Z_T1_R3 = c(0.963349308, 0.968589683, 1.706670339, 0.807060135), 
Z_T2_R1 = c(3.765036263, 0.477443352, 1.712841882, 0.469173869), Z_T2_R2 = c(1.901023385, 0.832736132, 2.223429427, 0.593558769), Z_T2_R3 = c(1.407713024, 0.911920317, 2.011259223, 0.692553388), Z_T3_R1 = c(0.988333629, 1.095130142, 1.648598854, 0.629915612), Z_T3_R2 = c(0.618606729, 0.497458337, 0.549147265, 1.249492088), Z_T3_R3 = c(0.429823986, 0.471389536, 0.977124788, 1.136635484)), row.names = c(NA, -4L ), class = c("data.table", "data.frame"))
Scripts used
# Reshape df1 to long format, derive a group label from the first two
# "_"-separated parts of each column name (e.g. "X_T0"), and average the
# replicates per Gene and group.
library(dplyr)
library(stringr)
library(tidyr)
# NOTE(review): gather()/spread() are superseded in tidyr by
# pivot_longer()/pivot_wider().
gdf1 <- gather(df1, "group", "Expression", -Gene)
gdf1$tgroup <- apply(str_split_fixed(gdf1$group, "_", 3)[, c(1, 2)],
1, paste, collapse ="_")
# NOTE(review): dplyr is already attached above; this second call is redundant.
library(dplyr)
# One row per Gene, one column per group, holding the mean expression.
tydf1 <- gdf1 %>%
group_by(Gene, tgroup) %>%
summarize(expression_mean = mean(Expression)) %>%
spread(., tgroup, expression_mean)
# Script 1: heatmap.2 with a precomputed row dendrogram
library(tidyverse)
# Move the first column (Gene) into the row names so that only numeric
# columns remain.
tydf1 <- tydf1 %>%
as.data.frame() %>%
column_to_rownames(var=colnames(tydf1)[1])
library(gplots)
library(vegan)
randup.m <- as.matrix(tydf1)
# 30-step red -> yellow -> dark green palette.
scaleRYG <- colorRampPalette(c("red","yellow","darkgreen"),
space = "rgb")(30)
# Average-linkage clustering of the rows on Euclidean distances.
data.dist <- vegdist(randup.m, method = "euclidean")
row.clus <- hclust(data.dist, "aver")
# Colv is not set here; per the discussion below, heatmap.2 then reorders the
# columns by default.
heatmap.2(randup.m, Rowv = as.dendrogram(row.clus),
dendrogram = "row", col = scaleRYG, margins = c(7,10),
density.info = "none", trace = "none", lhei = c(2,6),
colsep = 1:3, sepcolor = "black", sepwidth = c(0.001,0.0001),
xlab = "Identifier", ylab = "Rows")
# Script 2: base heatmap()
# NOTE(review): Gene was already moved to the row names above, so [, -1]
# removes the first expression column rather than the Gene column.
df2 <- as.matrix(tydf1[, -1])
heatmap(df2)
Also, I want to add a color key.
It is still unclear to me, what the desired output is. There are some notes:
You don't need to use vegdist() to calculate the distance matrix for your hclust() call, because all(vegdist(randup.m, method = "euclidean") == dist(randup.m)) returns TRUE;
Specifying Colv = F in your heatmap.2() call will prevent reordering of the columns (default is TRUE);
Maybe it is better to scale your data by row (see the commented-out scale = "row" line in the call below);
Your call of heatmap.2() returns the heatmap with color key.
So summing it up - in your first script you just miss the Colv = F argument, and after a little adjustment it looks like this:
# Row-clustered heatmap that keeps the columns in their original order.
heatmap.2(randup.m,
  Rowv = as.dendrogram(row.clus),  # precomputed row dendrogram
  Colv = FALSE,                    # do not reorder columns (spelled out: `F` can be reassigned)
  dendrogram = "row",
  # scale = "row",                 # optional: uncomment to scale the data by row
  col = scaleRYG,
  density.info = "none",
  trace = "none",
  srtCol = -45,                    # rotate the column labels
  adjCol = c(.1, .5),
  xlab = "Identifier",
  ylab = "Rows"
)
However I am still not sure - is it what you need?
I'm trying to pass a set of modified arguments from a larger function to arguments in a nested function. This is an argument supplied from the larger function:
time_dep_covariates_list = c(therapy_start = "Start of Therapy",
therapy_end = "End of Therapy")
I have these sets of constant arguments:
tmerge_args_1 <- alist(data1 = analytic_dataset,
data2 = analytic_dataset,
id = patientid,
tstop = adv_dx_to_event,
death_censor = event(adv_dx_to_event))
And I want to append these modified arguments to that argument list:
tmerge_args_2 <- lapply(1:length(time_dep_covariates_list), function(x){
tmerge_args <<- c(tmerge_args, alist('var' = tdc(var)) )
paste0(names(time_dep_covariates_list[x])," =
tdc(",names(time_dep_covariates_list[x]), ")")
})
> tdc_args
[[1]]
[1] "therapy_start = tdc(therapy_start)"
[[2]]
[1] "therapy_end = tdc(therapy_end)"
I want to create a do.call that handles the arguments like so:
count_process_form <- do.call(tmerge, args = c(tmerge_args_1,
tmerge_args_2)
That would be identical to the following:
tmerge(data1 = analytic_dataset, data2 = analytic_dataset,
id = patientid, tstop = adv_dx_to_event,
therapy_start = tdc(therapy_start), therapy_end = tdc(therapy_end)
It works fine with tmerge_args_1 by itself, but as the args_2 are character and not language elements, I get this error:
Error in (function (data1, data2, id, ..., tstart, tstop, options) :
all additional argments [sic] must have a name:
How can I modify the list I'm creating for args_2 so they're stored as arguments that do.call can understand? Or am I approaching this all wrong?
Thanks!
Here is a reproducible example:
analytic_dataset= data_frame(patientid = sample(1:1000,5),
adv_dx_to_event = sample(100:200, 5),
death_censor = sample(0:1,5, replace = T),
therapy_start = sample(1:20,5),
therapy_stop = sample(40:100,5))
The below would be passed in from a function:
time_dep_covariates_list = c(therapy_start = "Start of Therapy",
therapy_end = "End of Therapy")
tmerge_args_1 <- alist(data1 = analytic_dataset,
data2 = analytic_dataset,
id = patientid,
tstop = adv_dx_to_event,
death_censor = event(adv_dx_to_event))
do.call(tmerge,tmerge_args_1) #this works
# Build one entry per time-dependent covariate.
# lapply() collects the value of the last expression in the function, i.e. the
# paste0() string, so tmerge_args_2 ends up as an unnamed list of character
# strings such as "therapy_start = tdc(therapy_start)", not named calls.
tmerge_args_2 <- lapply(1:length(time_dep_covariates_list), function(x){
# NOTE(review): `tmerge_args` is not defined anywhere in the visible code, and
# the value assigned here via <<- is never used below.
tmerge_args <<- c(tmerge_args, alist('var' = tdc(var)) )
paste0(names(time_dep_covariates_list[x])," = tdc(",names(time_dep_covariates_list[x]), ")")
})
# The third positional argument of do.call() is `quote`, so tmerge_args_2 is
# not appended to the argument list in this call.
do.call(tmerge,tmerge_args_1,tmerge_args_2) # this doesn't