Why my graphic is empty? - r

I have a problem with structure.
Here is an exemple of what I'm doing.
x <- c(211.50, 200.50, 148.60, 144.20, 132.20, 159.80, 107.70, 91.40, 63.10, 62.10, 55.70, 74.60, 224.90, 208.001, 45.80, 133.50, 122.70, 161.70, 160.00, 136.80, 92.20, 91.20, 79.20, 109.90, 244.60, 212.20, 147.20, 129.30, 118.50, 165.80, 120.60, 97.90, 69.30, 65.50, 59.10, 81.90, 94.15, 114.20, 131.03, 133.89, 132.25, 153.51)
y <- x
Ref <- c(rep("ref1",36), rep("ref2",6))
ID <- c(rep("id1",6), rep("id2",6),rep("id3",6),rep("id4",6),rep("id5",6),rep("id6",6),rep("id7",6))
data.split <- data.frame(Ref,ID,x,y)
l.ref <- ddply(data.split, .(Ref), "nrow")
vec1 <- c(rep(1,l.ref$nrow[1]))
for (i in 2:length(l.ref$Ref)) {
vec2 <- c(rep(i,l.ref$nrow[i]))
vec3 <- append(vec1,vec2, after =length(vec1))
vec1 <- vec3
}
vec_ref <- vec3
l.id <- ddply(data.split, .(ID), "nrow")
vec1 <- c(rep(1,l.id$nrow[1]))
for (i in 2:length(l.id$ID)) {
vec2 <- c(rep(i,l.id$nrow[i]))
vec3 <- append(vec1,vec2, after =length(vec1))
vec1 <- vec3
}
vec_id <- vec3
df <- structure(list(Ref = structure(vec_ref, .Label = l.ref$Ref, class = "factor"),
Id = structure(vec_id, .Label = l.id$ID, class = "factor"),
x = data.split$x, y = data.split$y),
.Names = c("Ref", "Id", "x", "y"),
row.names = c(NA, -length(data.split$x)), class = "data.frame")
ggplot(data = df, aes(x = x, y = y, colour = df$Ref)) +
geom_point(aes(shape = df$Id)) + scale_shape_manual(value=1:length(l.id$ID))
Warning messages:
1: In [<-.factor(*tmp*, is.na(values), value = "NA") :
invalid factor level, NAs generated
2: Removed 42 rows containing missing values (geom_point).
I have empty graphic, I don't understand what is the problem?
What I'm doing wrong?

You've messed up your factors in your data frame. Whats all that business with 'structure'? Lose it, and use this:
df = data.frame(Ref = factor(vec_ref,labels=l.ref$Ref),
Id = factor(vec_id,labels=l.id$ID),
x=data.split$x,y=data.split$y)
and then the ggplot works.

Related

Trying to create an R function which finds the input value in column 1 of a dataframe and returns column 2 value of the same row

New to R functions, I have a dataframe which looks like this except about 10,000 rows long:
Gene.name
Ortho.name
abc
DEF
qrs
TUV
wx
YZ
I'm trying to create a really simple function in r which when I input qrs, returns TUV. If someone could help I would really appreciate it.
fun <- function(vec, data) data$Ortho.name[ match(vec, data$Gene.name) ]
Z <- structure(list(Gene.name = c("abc", "qrs", "wx"), Ortho.name = c("DEF", "TUV", "YZ")), class = "data.frame", row.names = c(NA, -3L))
fun("qrs", data = Z)
# [1] "TUV"
fun("nothing", data = Z)
# [1] NA
fun(c("qrs", "abc", "not found"), data = Z)
# [1] "TUV" "DEF" NA
In case anyone is using seurat for plotting orthologs in a cross-species comparison, this is how I implemented the above using orthologs from BioMart:
chick_fish_ortho <- read.csv("chick_orthos.csv")
mac_fish_ortho <- read.csv('mac_orthos.csv')
macfun('glula', mac_fish_ortho)
chickfun('glula', chick_fish_ortho)
chickfun <- function(vec, data) data$Chicken.gene.name[ match(vec, data$Gene.name) ]
macfun <- function(vec, data) data$Macaque.gene.name[ match(vec, data$Gene.name) ]
fish_chick_mac <- function(gene, chickdata, macdata) {
p1 = FeaturePlot(object = fish_MG, reduction = "umap", label = TRUE, min.cutoff = 0, features = gene)
p2 = FeaturePlot(object = chick_MG, reduction = "umap", label = TRUE, min.cutoff = 0, features = chickfun(gene, chickdata))
p3 = FeaturePlot(object = mac_MG, reduction = "umap", label = TRUE, min.cutoff = 0, features = macfun(gene, macdata))
p1 + p2 + p3
}
fish_chick_mac('glula', chick_fish_ortho, mac_fish_ortho)

formatting ggplot scales with s3 method and objects attributes

I have two custom classes that have different types of constructors and different types of formatting methods, one class relying on the attributes of the object.
as.test_1 <- function(x, test_attribute, ...) {
attributes(x) <- list("test_attribute" = test_attribute)
x <-
structure(x, class = c("test_1", setdiff(class(x), "test_1")))
return(x)
}
format.test_1 <- function(x, ...) {
paste0(x, attributes(x)$test_attribute)
}
print.test_1 <- function(x, ...) {
cat(format(x, ...), "\n")
}
as.test_2 <- function(x, test_attribute, ...) {
x <-
structure(x, class = c("test_2", setdiff(class(x), "test_2")))
return(x)
}
format.test_2 <- function(x, ...) {
paste0(x, "test")
}
print.test_2 <- function(x, ...) {
cat(format(x, ...), "\n")
}
Now I want to to plot some data that contains these classes:
library(data.table)
data_1 <-
data.table(a = as.test_1(c(1, 2, 3, 4), test_attribute = "test"),
b = as.factor(c("a", "b", "c", "d")))
data_2 <-
data.table(a = as.test_2(c(1, 2, 3, 4), test_attribute = "test"),
b = as.factor(c("a", "b", "c", "d")))
Also I want to use the format method to format the y-axis. This is my first try, which works for the test_2 class that does not rely on the attributes of the variable for formatting:
library(ggplot2)
plot_data <- function(data) {
g <- ggplot(data, aes(x = b, y = a))
g <- g + geom_bar(stat = "identity")
expand <- c(0, 0)
attributes(expand) <- attributes(data$a)
limits <- c(0, 4)
attributes(limits) <- attributes(data$a)
g <- g + scale_y_continuous(
expand = expand,
limits = limits,
oob = scales::squish,
labels = getS3method(f = "format", class = class(data$a)[[1]])
)
return(print(g))
}
However for the test_1 class it seems like the attributes get lost somewhere in the call of scale_y_continuous.
plot_data(data_1)
plot_data(data_2)
Does someone know how to fix this?
Thanks!

passing references of a vector as a target variable of the rpart formulae inside a loop in R

sample data:
x <- sample(c("aa", "bb", "cc", NA), 5000, replace = TRUE)
y <- sample(c("mm", "nn", "pp", NA), 5000, replace = TRUE)
z <- sample(c("uu", "vv", "ww", NA), 5000, replace = TRUE)
m <- sample(c(1:99, NA), replace = TRUE)
data <- data.frame(x, y, z, m)
creating a vector of Target variable for rpart
colname <- names(data[ , -m])
passing reference by colname index as target variable to rpart for imputing NA values in x, y , z:
for(i in seq_along(colname)) {
fm <- as.formula(paste0(colname[i], "~ ."))
mod <- rpart(fm, data = data, method = "class")
i1 <- is.na(data[[colname[i]]])
pred <- predict(mod, data[[colname[i]]][i1], type = "class")
data[[colname[i]]][i1] <- pred[i1]
}
i get the following error:
Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = attr(object, :
variable lengths differ (found for 'm')
We create the formula with paste and then do the assignment only for the NA elements
library(rpart)
for(i in seq_along(colname)) {
fm <- as.formula(paste0(colname[i], "~ ."))
m <- rpart(fm, data = data, method = "class")
i1 <- is.na(data[[colname[i]]])
pred <- predict(m, data[c(colname[i], "m")], type = "class")[i1]
data[[colname[i]]][i1] <- pred[i1]
}
colSums(is.na(data))
# x y z
# 0 0 0
Update
With the updated dataset, subset the data in the rpart
colname <- names(data)[1:3]
for(i in seq_along(colname)) {
fm <- as.formula(paste0(colname[i], "~ ."))
m <- rpart(fm, data = data[colname], method = "class")
i1 <- is.na(data[[colname[i]]])
pred <- predict(m, data[[colname[i]]][i1], type = "class")
data[[colname[i]]][i1] <- pred[i1]
}
data
set.seed(24)
x <- sample(c("aa", "bb", "cc", NA), 5000, replace = TRUE)
y <- sample(c("mm", "nn", "pp", NA), 5000, replace = TRUE)
z <- sample(c("uu", "vv", "ww", NA), 5000, replace = TRUE)
data <- data.frame(x, y, z)

Custom column names as arguments in the functions of stability R package

I developed the stability R package which can be installed from CRAN.
install.packages("stability")
However, I have difficulty in making it to take custom column names as function arguments. Here is an example of add_anova function
library(stability)
data(ge_data)
YieldANOVA <-
add_anova(
.data = ge_data
, .y = Yield
, .rep = Rep
, .gen = Gen
, .env = Env
)
YieldANOVA
The above code works fine. However, when I change the column names of the data.frame, it doesn't work as below:
df1 <- ge_data
names(df1) <- c("G", "Institute", "R", "Block", "E", "Y")
fm1 <-
add_anova(
.data = df1
, .y = Y
, .rep = Rep
, .gen = G
, .env = E
)
Error in model.frame.default(formula = terms(.data$Y ~ .data$E + .data$Rep:.data$E + :
invalid type (NULL) for variable '.data$Rep'
Similarly another function stab_reg
fm1Reg <-
stab_reg(
.data = df1
, .y = Y
, .gen = G
, .env = E
)
Error in eval(predvars, data, env) : object 'Gen' not found
The codes of these functions can be accessed by
getAnywhere(add_anova.default)
function (.data, .y, .rep, .gen, .env)
{
Y <- enquo(.y)
Rep <- enquo(.rep)
G <- enquo(.gen)
E <- enquo(.env)
fm1 <- lm(formula = terms(.data$Y ~ .data$E + .data$Rep:.data$E +
.data$G + .data$G:.data$E, keep.order = TRUE), data = .data)
fm1ANOVA <- anova(fm1)
rownames(fm1ANOVA) <- c("Env", "Rep(Env)", "Gen", "Gen:Env",
"Residuals")
fm1ANOVA[1, 4] <- fm1ANOVA[1, 3]/fm1ANOVA[2, 3]
fm1ANOVA[2, 4] <- NA
fm1ANOVA[1, 5] <- 1 - pf(as.numeric(fm1ANOVA[1, 4]), fm1ANOVA[1,
1], fm1ANOVA[2, 1])
fm1ANOVA[2, 5] <- 1 - pf(as.numeric(fm1ANOVA[2, 4]), fm1ANOVA[2,
1], fm1ANOVA[5, 1])
class(fm1ANOVA) <- c("anova", "data.frame")
return(list(anova = fm1ANOVA))
}
<bytecode: 0xc327c28>
<environment: namespace:stability>
and
getAnywhere(stab_reg.default)
function (.data, .y, .rep, .gen, .env)
{
Y <- enquo(.y)
Rep <- enquo(.rep)
G <- enquo(.gen)
E <- enquo(.env)
g <- length(levels(.data$G))
e <- length(levels(.data$E))
r <- length(levels(.data$Rep))
g_means <- .data %>% dplyr::group_by(!!G) %>% dplyr::summarize(Mean = mean(!!Y))
names(g_means) <- c("G", "Mean")
DataNew <- .data %>% dplyr::group_by(!!G, !!E) %>% dplyr::summarize(GEMean = mean(!!Y)) %>%
dplyr::group_by(!!E) %>% dplyr::mutate(EnvMean = mean(GEMean))
IndvReg <- lme4::lmList(GEMean ~ EnvMean | Gen, data = DataNew)
IndvRegFit <- summary(IndvReg)
StabIndvReg <- tibble::as_tibble(data.frame(g_means, Slope = coef(IndvRegFit)[,
, 2][, 1], LCI = confint(IndvReg)[, , 2][, 1], UCI = confint(IndvReg)[,
, 2][, 2], R.Sqr = IndvRegFit$r.squared, RMSE = IndvRegFit$sigma,
SSE = IndvRegFit$sigma^2 * IndvRegFit$df[, 2], Delta = IndvRegFit$sigma^2 *
IndvRegFit$df[, 2]/r))
MeanSlopePlot <- ggplot(data = StabIndvReg, mapping = aes(x = Slope,
y = Mean)) + geom_point() + geom_text(aes(label = G),
size = 2.5, vjust = 1.25, colour = "black") + geom_vline(xintercept = 1,
linetype = "dotdash") + geom_hline(yintercept = mean(StabIndvReg$Mean),
linetype = "dotdash") + labs(x = "Slope", y = "Mean") +
scale_x_continuous(sec.axis = dup_axis(), labels = scales::comma) +
scale_y_continuous(sec.axis = dup_axis(), labels = scales::comma) +
theme_bw()
return(list(StabIndvReg = StabIndvReg, MeanSlopePlot = MeanSlopePlot))
}
<bytecode: 0xe431010>
<environment: namespace:stability>
One of the problems in the data 'df1' is the column name is 'R' instead of "Rep" which was passed into the function. Second, the terms passed into the formula are quosures. we could change it to string with quo_names and then construct formula with paste
add_anova1 <- function (.data, .y, .rep, .gen, .env) {
y1 <- quo_name(enquo(.y))
r1 <- quo_name(enquo(.rep))
g1 <- quo_name(enquo(.gen))
e1 <- quo_name(enquo(.env))
fm <- formula(paste0(y1, "~", paste(e1, paste(r1, e1, sep=":"),
g1, paste(g1, e1, sep=":"), sep="+")))
fm1 <- lm(terms(fm, keep.order = TRUE), data = .data)
fm1ANOVA <- anova(fm1)
rownames(fm1ANOVA) <- c("Env", "Rep(Env)", "Gen", "Gen:Env",
"Residuals")
fm1ANOVA[1, 4] <- fm1ANOVA[1, 3]/fm1ANOVA[2, 3]
fm1ANOVA[2, 4] <- NA
fm1ANOVA[1, 5] <- 1 - pf(as.numeric(fm1ANOVA[1, 4]), fm1ANOVA[1,
1], fm1ANOVA[2, 1])
fm1ANOVA[2, 5] <- 1 - pf(as.numeric(fm1ANOVA[2, 4]), fm1ANOVA[2,
1], fm1ANOVA[5, 1])
class(fm1ANOVA) <- c("anova", "data.frame")
return(list(anova = fm1ANOVA))
}
YieldANOVA2 <- add_anova1(
.data = df1
, .y = Y
, .rep = R
, .gen = G
, .env = E
)
-checking with the output generated using 'ge_data' without changing the column names
all.equal(YieldANOVA, YieldANOVA2, check.attributes = FALSE)
#[1] TRUE
Similarly stab_reg could be changed

Create a one-row dataframe where every column is the same as an existing variable

I often write code like this:
answer.df = data.frame(x = numeric(0), y = numeric(0), z = numeric(0))
for (i in 1:100) {
x = do_stuff(i)
y = do_more_stuff(i)
z = yet_more_stuff(i)
# Is there a better way of doing this:
temp.df = data.frame(x = x, y = y, z = z)
answer.df = rbind(answer.df, temp.df)
}
My question is, in the line temp.df = data.frame(x = x, y = y, z = z), is there a neater way of doing this? Imagine it with ten or more variables to understand my problem.
Try this:
do.call("rbind", lapply(1:100, function(i) list(x = xfun(i), y = yfun(i))))
Also try rbindlist from data.table which may have some performance advantages:
library(data.table)
rbindlist(lapply(1:100, function(i) list(x = xfun(i), y = yfun(i))))

Resources