I'm trying to write a loop that performs anova and TukeyHSD on my data across 3 samples for each "Label". Label in this case is a metabolic pathway. Data that goes into it are the genes expressed in said metabolic pathway.
For the test data, I created a small df that reproduces my error. In my actual data, I'm hoping to do perform this across 2 factors (not just one) and I have thousands of more rows.
library(reshape2)
df<-melt(data.frame(sample1 = c(0,0,3,4,5,1),sample2 = c(1,0,0,4,5,0),sample3 = c(0,0,0,8,0,0),Label = c("TCA cycle", "TCA cycle","TCA cycle", "Glycolysis","Glycolysis","Glycolysis"),Gene = c("k1","k2","k3","k4","k5","k6")))
My approach (annotated the best way I can!):
fxn<-unique(df$Label) #create list
for (i in 1:length(fxn)){
if (!exists("data")){ #if the "data" dataframe does not exist, start here!
depth<-aov(df$value[df$Label==fxn[i]]~df$variable[df$Label==fxn[i]]) #perform anova on my "df", gene values as a factor of samples (for each "fxn")
hsd<-TukeyHSD(depth) #calculate tukeyHSD
data<-as.data.frame(hsd$`df$variable[df$Label == fxn[i]]`) #grab dataframe of tukey HSD output
data$Label<-fxn[i] #add in the Label name as a column (so it looks like my original df, but with TukeyHSD output for each pairwise comparison
data<-as.data.frame(data)
}
if (exists("data")){ #if "data" exists, do this:
tmpdepth<-aov(df$value[df$Label==fxn[i]]~df$variable[df$Label==fxn[i]])
tmphsd<-TukeyHSD(tmpdepth)
tmpdata<-as.data.frame(tmphsd$`df$variable[df$Label == fxn[i]]`)
tmpdata$Label<-fxn[i]
tmpdata<-as.data.frame(tmpdata)
data<-rbind(data,tmpdata) #combine with original data
data<-as.data.frame
rm(tmpdata)
}
}
I'd like my output to look like this:
diff lwr upr p adj Label
sample2-sample1 -0.3333333 -8.600189 7.933522 0.9916089 Glycolysis
sample3-sample1 -0.6666667 -8.933522 7.600189 0.9669963 Glycolysis
sample3-sample2 -0.3333333 -8.600189 7.933522 0.9916089 Glycolysis
but the Label column has all the factors that went into "fxn".
Errors:
Error in rep(xi, length.out = nvar) :
attempt to replicate an object of type 'closure'
You forgot the second data in the last line before rm(tmpdata). It should be:
data<-as.data.frame(data)
I my implementation I changed your code as follows:
datav <- data.frame(diff = double(),
lwr = double(),
upr = double(),
'p adj' = double(),
'Label' = character())
for (fxn in unique(df$Label)){
depth <- aov(df$value[df$Label==fxn] ~ df$variable[df$Label==fxn])
hsd <- TukeyHSD(depth)
tmp <- as.data.frame(hsd$`df$variable[df$Label == fxn]`)
tmp$Label <- fxn
datav <- rbind(datav, tmp)
}
Initializing the data.frame before hand you do not need the if statement. Also data is a function in R, so I rename the variable data to datav.
Related
I have run several (17) meta-analyses (identified by specific names) and I need to extract the models' outputs into one single table, as well as add a column with the name of each name. I have done it manually, but I was wondering if I could build a loop to do so.
I'm attaching the first three of the 17 analyses, the "names" being "cent", "dist", and "sqrs"
#meta-analyses
res_cent<-rma.mv(yi, vi, mods = ~ factor(drug)-1, random = list(~ 1 | publication_id,~ 1 | strain_def),
data = SR_meta,subset=(SR_meta$measure=="cent"))
res_dist<-rma.mv(yi, vi, mods = ~ factor(drug)-1, random = list(~ 1 | publication_id,~ 1 | strain_def),
data = SR_meta,subset=(SR_meta$measure=="dist"))
res_sqrs<-rma.mv(yi, vi, mods = ~ factor(drug)-1, random = list(~ 1 | publication_id,~ 1 | strain_def),
data = SR_meta,subset=(SR_meta$measure=="sqrs"))
#Creating list for model output - cent
list_cent<-coef(summary(res_cent))
list_cent<-setNames(cbind(rownames(list_cent), list_cent, row.names = NULL),
c("Drug", "Estimate", "se","zval","p-value","CI_l","CI_u"))
df_cent <- list_cent[ -c(3,4) ]
df_cent$Drug<-gsub("factor*","",df_cent$Drug)
df_cent$Drug<-gsub("drug*","",df_cent$Drug)
df_cent$Drug<-gsub("[[:punct:]]","",df_cent$Drug)
n_cent<-plyr::count(cent_sum2, vars = "drug")
names(n_cent)[names(n_cent) == "freq"] <- "n_cent"
df_cent<-cbind(df_cent,n_cent[2])
##same thing can be repeated for the other two measures "dist", and "sqrs".
The output is a data frame that contains the name of the drugs used as factors in the meta-analyses, their estimated effect sizes, p-values, confidence intervals, and how many measures we have per factor (n). I want to compile all of these outputs in a table, (at the end of the code called "matrix_ps") and add a column with the name of the measures.
I have done all the steps manually (below) but it looks extremely inefficient.
Is there a way to create a loop to do this, in which the all the names of the measures are changed an then outcome is appended?
Something like
measures<-c("cent","dist","sqrs")
for(i in measures) - not sure how to continue?
matrix_cent<-data.frame(df_cent$Drug,list_cent$`p-value`,df_cent$n_cent,df_cent$Estimate,df_cent$CI_l,df_cent$CI_u)
matrix_dist<-data.frame(df_dist$Drug,list_dist$`p-value`,df_dist$n_dist,df_dist$Estimate,df_dist$CI_l,df_dist$CI_u)
matrix_sqrs<-data.frame(df_sqrs$Drug,list_sqrs$`p-value`,df_sqrs$n_sqrs,df_sqrs$Estimate,df_sqrs$CI_l,df_sqrs$CI_u)
matrix_cent$measure<-"cent"
matrix_dist$measure<-"dist"
matrix_sqrs$measure<-"sqrs"
matrix_cent<-matrix_cent%>% rename(drug=df_cent.Drug,measure=measure,p=list_cent..p.value.,n=df_cent.n_cent,estimate=df_cent.Estimate,ci_low=df_cent.CI_l,ci_up=df_cent.CI_u)
matrix_dist<-matrix_dist%>% rename(drug=df_dist.Drug,measure=measure,p=list_dist..p.value.,n=df_dist.n_dist,estimate=df_dist.Estimate,ci_low=df_dist.CI_l,ci_up=df_dist.CI_u)
matrix_sqrs<-matrix_sqrs%>% rename(drug=df_sqrs.Drug,measure=measure,p=list_sqrs..p.value.,n=df_sqrs.n_sqrs,estimate=df_sqrs.Estimate,ci_low=df_sqrs.CI_l,ci_up=df_sqrs.CI_u)
matrix_ps<-rbind(matrix_cent,matrix_dist,matrix_rear,matrix_sqrs,matrix_toa,matrix_eca,matrix_eoa,matrix_trans,matrix_dark,matrix_light,matrix_stps,matrix_rrs,matrix_time,matrix_toc,matrix_cross,matrix_hd,matrix_lat)
We don't have your data but you can put all your code in a function :
get_result <- function(x, y) {
list_cent<-coef(summary(x))
list_cent<-setNames(cbind(rownames(list_cent), list_cent, row.names = NULL),
c("Drug", "Estimate", "se","zval","p-value","CI_l","CI_u"))
df_cent <- list_cent[ -c(3,4) ]
df_cent$Drug<-gsub("factor*","",df_cent$Drug)
df_cent$Drug<-gsub("drug*","",df_cent$Drug)
df_cent$Drug<-gsub("[[:punct:]]","",df_cent$Drug)
n_cent<-plyr::count(cent_sum2, vars = "drug")
names(n_cent)[names(n_cent) == "freq"] <- y
df_cent<-cbind(df_cent,n_cent[2])
return(df_cent)
}
Now assuming all your analyses follow the pattern 'res_' you can do :
library(purrr)
list_models <- mget(ls(pattern = 'res_'))
result <- imap(list_models, get_result) %>% reduce(inner_join)
I'm trying to access individual elements of an object returned from TukeyHSD function. I can see how to access the individual elements, but how do I access the labels?
$`Auto$CargoSpace`
diff lwr upr p adj
wagon-SUV -3747.333 -7664.507 -980.0801 6.855348e-03
trunk-SUV -4792.333 -5621.311 -2371.3357 2.065806e-05
trunk-wagon -968.000 -3823.523 2125.54328 7.410039e-01
I'd like to be able to access each row/column combination the way I can with a tibble or dataframe. That way I can add an interpretation later in the code. Let's say I wanted to end up with this result:
CargoSpace p adj Analysis
wagon-SUV 6.85 Unlikely to produce a benefit
trunk-SUV 2.06 Worth investigating
We extract the output with $, then use either grep (if partial matches) or %in% (for fixed matching) to subset the rows, create a data.frame with the row names of the subset of the dataset along with "p adj" column. Then, we can create 'Analysis' column based on the value of 'p.adj'
out1 <- out$`Auto$CargoSpace`
out2 <- out1[grep("SUV$", row.names(out1),]
out3 <- data.frame(CargoSpace = row.names(out2), p.adj = out2[, "p adj"])
out3$Analysis <- ifelse(out2$p.adj < 0.0001, "Worth investigating", "Unlikely to produce a benefit")
Reproducible example
fm1 <- aov(breaks ~ wool + tension, data = warpbreaks)
out <- TukeyHSD(fm1, "tension", ordered = TRUE)
out$tension
# diff lwr upr p adj
#M-H 4.722222 -4.6311985 14.07564 0.447421021
#L-H 14.722222 5.3688015 24.07564 0.001121788
#L-M 10.000000 0.6465793 19.35342 0.033626219
I have a dataset of 25 variables and 248 rows.
There are 8-factor variables and the rest are integers and numbers.
I am trying to run XGBoost.
I have done the following code: -
# Partition Data
set.seed(1234)
ind <- sample(2, nrow(mission), replace = T, prob = c(0.7,0.3))
train <- mission[ind == 1,]
test <- mission[ind == 2,]
# Create matrix - One-Hot Encoding for Factor variables
trainm <- sparse.model.matrix(GRL ~ .-1, data = train)
head(trainm)
train_label <- train[,"GRL"]
train_matrix <- xgb.DMatrix(data = as.matrix(trainm), label = train_label)
testm <- sparse.model.matrix(GRL~.-1, data = test)
test_label <- test[,"GRL"]
test_matrix <- xgb.DMatrix(data = as.matrix(testm),label = test_label)
The response variable here is "GRL" and I am running the test_label <- test[,"GRL"]
The above code is getting executed but when I am trying to use it in xgb.DMatrix, I am encountering the following error:
Error in setinfo.xgb.DMatrix(dmat, names(p), p[[1]]) :
The length of labels must equal to the number of rows in the input data
I have partitioned the data into 70:30.
test[,"GRL"] returns a data.frame, and XGBoost needs the label to be a vector.
Just use teste$GRL or test[["GRL"]] instead. You also need to do the same for the training dataset
I have a 2 different data frames for which i would like to perform linear regression
I have written following code for it
mydir<- "/media/dev/Daten/Task1/subject1/t1"
#multiple subject paths should be given here
# read full paths
myfiles<- list.files(mydir,pattern = "regional_vol*",full.names=T)
# initialise the dataframe from first file
df<- read.table( myfiles[1], header = F,row.names = NULL, skip = 3, nrows = 1,sep = "\t")
# [-c(1:3),]
df
#read all the other files and update dataframe
#we read 4 lines to read the header correctly, then remove 3
ans<- lapply(myfiles[-1], function(x){ read.table( x, header = F, skip = 3, nrows = 1,sep = "\t") })
ans
#update dataframe
#[-c(1:3),]
lapply(ans, function(x){df<<-rbind(df,x)} )
#this should be the required dataframe
uncorrect<- array(df)
# Linear regression of ICV extracted from global size FSL
# Location where your icv is located
ICVdir <- "/media/dev/Daten/Task1/T1_Images"
#loding csv file from ICV
mycsv <- list.files(ICVdir,pattern = "*.csv",full.names = T )
af<- read.csv(file = mycsv,header = TRUE)
ICV<- as.data.frame(af[,2],drop=FALSE)
#af[1,]
#we take into consideration second column of csv
#finalcsv <-lapply(mycsv[-1],fudnction(x){read.csv(file="global_size_FSL")})
subj1<- as.data.frame(rep(0.824,each=304))
plot(df ~ subj1, data = df,
xlab = "ICV value of each subject",
ylab = "Original uncorrected volume",
main="intercept calculation"
)
fit <- lm(subj1 ~ df )
The data frame df has 304 values in following format
6433 6433
1430 1430
1941 1941
3059 3059
3932 3932
6851 6851
and another data frame Subj1 has 304 values in following format
0.824
0.824
0.824
0.824
0.824
When i run my code i am incurring following error
Error in model.frame.default(formula = subj1 ~ df, drop.unused.levels = TRUE) :
invalid type (list) for variable 'subj1'
any suggestions why the data.frame values from variable subj1 are invalid
As mentioned, you are trying to give a data.frame as an independent variable. Try:
fit <- lm(subj1 ~ ., data=df )
This will use all variables in the data frame, as long as subj1 is the dependent variable's name, and not a data frame by itself.
If df has two columns which are the predictors, and subj1 is the predicted (dependent) variable, combing the two, give them proper column names, and create the model in the format above.
Something like:
data <- cbind(df, subj1)
names(data) <- c("var1", "var2", "subj1")
fit <- lm(subj1 ~ var1 + var2, data=df )
Edit: some pointers:
make sure you use a single data frame that holds all of your independent variables, and your dependent variable.
The number of rows should be equal.
If an independent variable in a constant, it has no variance for different values of the dependent variable, and so will have no meaning. If the dependent variable is a constant, there is no point for regressing - we can predict the value with 100% accuracy.
I have the famous titanic data set from Kaggle's website. I want to predict the survival of the passengers using logistic regression. I am using the glm() function in R. I first divide my data frame(total rows = 891) into two data frames i.e. train(from row 1 to 800) and test(from row 801 to 891).
The code is as follows
`
>> data <- read.csv("train.csv", stringsAsFactors = FALSE)
>> names(data)
`[1] "PassengerId" "Survived" "Pclass" "Name" "Sex" "Age" "SibSp"
[8] "Parch" "Ticket" "Fare" "Cabin" "Embarked" `
#Replacing NA values in Age column with mean value of non NA values of Age.
>> data$Age[is.na(data$Age)] <- mean(data$Age, na.rm = TRUE)
#Converting sex into binary values. 1 for males and 0 for females.
>> sexcode <- ifelse(data$Sex == "male",1,0)
#dividing data into train and test data frames
>> train <- data[1:800,]
>> test <- data[801:891,]
#setting up the model using glm()
>> model <- glm(Survived~sexcode[1:800]+Age+Pclass+Fare,family=binomial(link='logit'),data=train, control = list(maxit = 50))
#creating a data frame
>> newtest <- data.frame(sexcode[801:891],test$Age,test$Pclass,test$Fare)
>> prediction <- predict(model,newdata = newtest,type='response')
`
And as I run the last line of code
prediction <- predict(model,newdata = newtest,type='response')
I get the following error
Error in eval(expr, envir, enclos) : object 'Age' not found
Can anyone please explain what the problem is. I have checked the newteset variable and there doesn't seem to be any problem in that.
Here is the link to titanic data set https://www.kaggle.com/c/titanic/download/train.csv
First, you should add the sexcode directly to the dataframe:
data$sexcode <- ifelse(data$Sex == "male",1,0)
Then, as I commented, you have a problem in your columns names in the newtest dataframe because you create it manually. You can use directly the test dataframe.
So here is your full working code:
data <- read.csv("train.csv", stringsAsFactors = FALSE)
data$Age[is.na(data$Age)] <- mean(data$Age, na.rm = TRUE)
data$sexcode <- ifelse(data$Sex == "male",1,0)
train <- data[1:800,]
test <- data[801:891,]
model <- glm(Survived~sexcode+Age+Pclass+Fare,family=binomial(link='logit'),data=train, control = list(maxit = 50))
prediction <- predict(model,newdata = test,type='response')