Applying lm() using sapply or lapply - r

So I'm trying to use lm() with sapply.
#example data and labels
data <- matrix(data = runif(1000), nrow = 100, ncol = 10))
markers <- sample(0:1, replace = T, size = 100)
# try to get linear model stuff
Lin <- sapply(data, function(x) lm(unlist(markers) ~ unlist(x))$coefficients)
MY problem is that this gives me coefficients for 1000 equations rather than 10

You need to supply sapply with a data frame, not a matrix.
#example data and labels
data <- data.frame(matrix(data = runif(1000), nrow = 100, ncol = 10))
markers <- sample(0:1, replace = T, size = 100)
# try to get linear model stuff
sapply(data, function(x) coef(lm(markers ~ x)))
sapply(data, function(x) coef(lm(markers ~ x))[-1]) # Omit intercepts
X1.x X2.x X3.x X4.x X5.x
0.017043626 0.518378546 -0.011110972 -0.145848478 0.335232991
X6.x X7.x X8.x X9.x X10.x
0.015122184 0.001985933 0.191279594 -0.077689961 -0.107411203
Your original matrix fails:
data <- matrix(data = runif(1000), nrow = 100, ncol = 10)
sapply(data, function(x) coef(lm(markers ~ x)))
# Error: variable lengths differ (found for 'x')
Because sapply, which calls lapply, will convert its first argument, X, to a list using as.list before performing the function. But as.list applied to a matrix results in list with length equal to the number of entries in the matrix, in your case 1,000. as.list when applied to a data frame results in a list with length equal to the number of columns of the data frame, in your case 10, with the elements containing the values in each column.
> lapply
function (X, FUN, ...)
{
FUN <- match.fun(FUN)
if (!is.vector(X) || is.object(X))
X <- as.list(X)
.Internal(lapply(X, FUN))
}
<bytecode: 0x000002397f5ce508>
<environment: namespace:base>

Related

How to Loop Through Specific Column Names in R

How can you iterate in a for loop with specific column names in R? This is the dataset I am using and below are the names of the columns I want to iterate. Also are the column number.
When I try to iterate, it does not compile. I need this to create a multiple cluster data visualization.
if (!require('Stat2Data')) install.packages('Stat2Data')
library(Stat2Data)
data("Hawks")
#summary(Hawks)
for (i in 10:13(Hawks)){
print(Hawks$ColumnName)
}
for (i in Hawks(c("Wing","Weight","Culmen","Hallux"))){
print(Hawks$ColumnName)
}
EDIT
After what Martin told me, this error occurs:
Error in [.data.frame`(Hawks, , i) : undefined columns selected
This is the code I have:
if(!require('DescTools')) {
install.packages('DescTools')
library('DescTools')
}
Hawks$Wing[is.na(Hawks$Wing)] <- mean(Hawks$Wing, na.rm = TRUE)
Hawks$Weight[is.na(Hawks$Weight)] <- mean(Hawks$Weight, na.rm = TRUE)
Hawks$Culmen[is.na(Hawks$Culmen)] <- mean(Hawks$Culmen, na.rm = TRUE)
Hawks$Hallux[is.na(Hawks$Hallux)] <- mean(Hawks$Hallux, na.rm = TRUE)
# Parámetro Wing
n <- nrow(Hawks) # Number of rows
for (col_names in 10:13){
x <- matrix(Hawks[, i],0.95*n)
#x <- rbind(x1,x2)
plot (x)
fit2 <- kmeans(x, 2)
y_cluster2 <- fit2$cluster
fit3 <- kmeans(x, 3)
y_cluster3 <- fit3$cluster
fit4 <- kmeans(x, 4)
y_cluster4 <- fit4$cluster
}

Condition to check values ​between lists and add to new list in R

If the last value of each sublist in the list ListResiduals (e.g: OptionAOptionD) is > than the value with the corresponding name in the ListSigma (e.g: OptionAOptionD), it adds the name (e.g: OptionAOptionD) to the Watchlist list.
In the last line of the code I put "> 5" just for the example work, it's the "> 5" that I want to replace in the condition that I mentioned in the previous paragraph.
DF <- data.frame("OptionA" = sample(1:100, 50),
"OptionB" = sample(1:100, 50),
"OptionC" = sample(1:100, 50),
"OptionD" = sample(1:100, 50))
#Unfolding options and creating DF
UnFolding <- data.frame(
First = as.vector(sapply(names(DF[]), function(x)
sapply(names(DF[]), function(y)
paste0(x)))),
Second = as.vector(sapply(names(DF[]), function(x)
sapply(names(DF[]), function(y)
paste0(y)))))
#Deleting lines with the same names
UnFolding <-
UnFolding[UnFolding$First != UnFolding$Second, ]
#Creating list with dependent and independent variables
LMList <- apply(UnFolding, 1, function(x)
as.formula(paste(x[1], "~", x[2])))
#Change list data to variable names
names(LMList) <- substring(lapply(LMList, paste, collapse = ""), 2)
#Linear regression - lm()
LMListRegression <- lapply(LMList, function(x) {
eval(call("lm", formula = x, data = DF))
})
#Residuals
ListResiduals <- lapply(LMListRegression, residuals)
#Sigma
ListSigma <- lapply(LMListRegression, function(x) {
sigma(x)*2
})
#Watchlist
Watchlist <- as.list(unlist(lapply(ListResiduals,
function(x) names(x)[1][tail(x, 1) > 5])))
I would gravitate towards converting your Simga and Residual values to a vector and compare the vectors. You could also use a data.frame approach to be sure the order of your lists/vectors doesn't change.
# create a vector with the last value from the Residuals list.
last_residual <- sapply(ListResiduals, `[`, 50)
names(last_residual) <- substr(names(last_residual), 1, stop = -4)
# Using sapply() rather than lapply, will return a named vector
sigma_vector <- sapply(LMListRegression, function(x) {
sigma(x)*2
})
Watchlist <- sigma_vector[last_residual > sigma_vector]
Watchlist
# named numeric(0)
In your example, it returns an empty named vector because no values meet your condition
max(last_residual)
# [1] 31.70949
min(sigma_vector)
# [1] 52.93234
# To demonstrate that it works, let's devide sigma by 2 so that at least some values will pass
half_sigma <- sigma_vector/2
Watchlist2 <- sigma_vector[last_residual > half_sigma]
Watchlist2
# OptionDOptionA OptionDOptionB OptionDOptionC
# 54.52411 57.09503 56.79341

Performing a large number of 2-sample t-tests in R

So I am creating a function which allows me to take a data.frame and get a dataframe of p.values for each variable tested.
# data and labels
my_data <- data.frame(matrix(data = rnorm(10000), nrow = 100, ncol = 10000))
labels <- sample(0:1, 100, replace = TRUE)
# append the labels to the data, then filter
my_data$labels <- labels
sample_1 <- dplyr::filter(.data = my_data, labels == 0)
sample_2 <- dplyr::filter(.data = my_data, labels == 1)
#perform a t-test on each column
p_vals <- data.frame()
for(i in c(1:10000)) {
p_vals <- rbind(p_vals, t.test(x = sample_1[,i], y = sample_2[,i])$p.value)
}
return(p_vals)
This is functional, but I think/hope there would be a more efficient way to do this without the for loop. The data should be in rows because for later functions it will be important to keep track of which variable had which p value.
Instead of splitting the samples you can use the formula interface to t.test, and sapply over the columns of my_data to conduct the tests:
p_vals <- sapply( my_data, function(x) t.test(x ~ labels)$p.value )
This will make a vector of p-values, the order will be the same as the columns of my_data
You can also use the package genefilter:
library(genefilter)
colttests(as.matrix(my_data[,-ncol(my_data)]),factor(my_data$labels))

Apply function or Loop in R: Not numerical, returning NA

I am working with a resampling procedure in R (just like a bootstrap). I have a matrix of response/explanatory variables and would like to make 999 samples of this matrix to calculate for each statistic I am working their mean, sd and confidence interval. So, I wrote a function to calculate and to return a list:
mydata <- data.frame(a=rnorm(20, 1, 1), b = rnorm(20,1,1))
myfun <- function(data, n){
sample <- data[sample(n, replace = T),]
model1 <- lm(sample[,1]~sample[,2])
return(list(model1[[1]][[1]], model1[[1]][[2]]))
}
result <- as.numeric()
result <- replicate(99, myfun(mydata, 10))
Then, I have a matrix as my output in which the rows are the statistics and the columns are the samplings (nrow = 2 and ncol = 99). I need the mean and sd for each row, but when I try to use the apply function or even a loop the following message shows up:
In mean.default(newX[, i], ...) :
argument is not numeric or logical: returning NA
Moreover:
is.numeric(result)
[1] FALSE
I found it strange, because I never had such problem with similar procedures.
Any thoughts?
Use the following:
myfun <- function(dat, n){
dat1 <- dat[sample(n, replace = T),]
model1 <- lm(dat1[,1] ~ dat1[,2])
return(coef(model1))
}
replicate(99, myfun(mydata, 10))
The reason is the 'result' is a list of 198 elements with dimension attributes. We need to unlist the 'result' and provide the dimension attributes
result1 <- `dim<-`(unlist(result), dim(result))
and then use the apply
Just replace list() by c() in your myfun() function
mydata <- data.frame(a=rnorm(20, 1, 1), b = rnorm(20,1,1))
myfun <- function(data, n){
sample <- data[sample(n, replace = T),]
model1 <- lm(sample[,1]~sample[,2])
return(c(model1[[1]][[1]], model1[[1]][[2]]))
}
result <- as.numeric()
result <- replicate(99, myfun(mydata, 10))
apply(result, FUN=mean, 1)
apply(result, FUN=sd, 1)
This worked for me:
mydata <- data.frame(a=rnorm(20, 1, 1), b = rnorm(20,1,1))
myfun <- function(data, n){
sample <- data[sample(n, replace = T),]
model1 <- lm(sample[,1]~sample[,2])
return(data.frame(v1 = model1[[1]][[1]], v2 = model1[[1]][[2]]))
}
result <- do.call("rbind",(replicate(99, myfun(mydata, 10), simplify = FALSE)))

p-value matrix of x and y variables from anova output

I have many X and Y variables (something like, 500 x 500). The following just small data:
yvars <- data.frame (Yv1 = rnorm(100, 5, 3), Y2 = rnorm (100, 6, 4),
Yv3 = rnorm (100, 14, 3))
xvars <- data.frame (Xv1 = sample (c(1,0, -1), 100, replace = T),
X2 = sample (c(1,0, -1), 100, replace = T),
Xv3 = sample (c(1,0, -1), 100, replace = T),
D = sample (c(1,0, -1), 100, replace = T))
I want to extact p-values and make a matrix like this:
Yv1 Y2 Yv3
Xv1
X2
Xv3
D
Here is my attempt to loop the process:
prob = NULL
anova.pmat <- function (x) {
mydata <- data.frame(yvar = yvars[, x], xvars)
for (i in seq(length(xvars))) {
prob[[i]] <- anova(lm(yvar ~ mydata[, i + 1],
data = mydata))$`Pr(>F)`[1]
}
}
sapply (yvars,anova.pmat)
Error in .subset(x, j) : only 0's may be mixed with negative subscripts
What could be the solution ?
Edit:
For the first Y variable:
For first Y variable:
prob <- NULL
mydata <- data.frame(yvar = yvars[, 1], xvars)
for (i in seq(length(xvars))) {
prob[[i]] <- anova(lm(yvar ~ mydata[, i + 1],
data = mydata))$`Pr(>F)`[1]
}
prob
[1] 0.4995179 0.4067040 0.4181571 0.6291167
Edit again:
for (j in seq(length (yvars))){
prob <- NULL
mydata <- data.frame(yvar = yvars[, j], xvars)
for (i in seq(length(xvars))) {
prob[[i]] <- anova(lm(yvar ~ mydata[, i + 1],
data = mydata))$`Pr(>F)`[1]
}
}
Gives the same result as above !!!
Here is an approach that uses plyr to loop over the columns of a dataframe (treating it as a list) for each of the xvars and yvars, returning the appropriate p-value, arranging it into a matrix. Adding the row/column names is just extra.
library("plyr")
probs <- laply(xvars, function(x) {
laply(yvars, function(y) {
anova(lm(y~x))$`Pr(>F)`[1]
})
})
rownames(probs) <- names(xvars)
colnames(probs) <- names(yvars)
Here is one solution, which consists in generating all combinations of Y- and X-variables to test (we cannot use combn) and run a linear model in each case:
dfrm <- data.frame(y=gl(ncol(yvars), ncol(xvars), labels=names(yvars)),
x=gl(ncol(xvars), 1, labels=names(xvars)), pval=NA)
## little helper function to create formula on the fly
fm <- function(x) as.formula(paste(unlist(x), collapse="~"))
## merge both datasets
full.df <- cbind.data.frame(yvars, xvars)
## apply our LM row-wise
dfrm$pval <- apply(dfrm[,1:2], 1,
function(x) anova(lm(fm(x), full.df))$`Pr(>F)`[1])
## arrange everything in a rectangular matrix of p-values
res <- matrix(dfrm$pval, nc=3, dimnames=list(levels(dfrm$x), levels(dfrm$y)))
Sidenote: With high-dimensional datasets, relying on the QR decomposition to compute the p-value of a linear regression is time-consuming. It is easier to compute the matrix of Pearson linear correlation for each pairwise comparisons, and transform the r statistic into a Fisher-Snedecor F using the relation F = νar2/(1-r2), where degrees of freedom are defined as νa=(n-2)-#{(xi=NA),(yi=NA)} (that is, (n-2) minus the number of pairwise missing values--if there're no missing values, this formula is the usual coefficient R2 in regression).

Resources