I'm writing a function to get some descriptive stats from a data frame. The function takes three argument: data set, set of numerical variables, set of character variables. I managed to write the function to successfully obtain the required result when the both numerical and character variables are identified within the argument. However, when one of these argument is missing, I'd like the function to return a list with two components with the missing argument as NULL within its component.
Here's the code I've written. Please let me know if you have an answer.
table1 <- function(dat, numvar, charvar){
result_n <- numeric()
result_c <- data.frame()
#This is the original table function for numerical values
for (i in 1:length(numvar)) {
new_row <- c(round(mean(dat[[numvar[i]]],na.rm = T),2) ,
round(median(dat[[numvar[i]]],na.rm = T),2),
round(sd(dat[[numvar[i]]],na.rm = T),2),
length(dat[[numvar[i]]])-sum(is.na(dat[[numvar[i]]])),
sum(is.na(dat[[numvar[i]]])))
result_n <- rbind(result_n,new_row)
}
rownames(result_n) <- numvar
colnames(result_n) <- c("Mean", "Median", "SD", "N", "N_miss")
#Thisi is the new table for char values
for (i in 1:length(charvar)) {
tab.dat <- as.data.frame(table(dat[charvar[i]],useNA = "ifany" ))
a1 <- as.character(tab.dat$Var1)
a1[3] <- "NMiss"
one.table <- data.frame(
Varname = c(charvar[i], rep(" ", nrow(tab.dat)-1)),
group = a1,
count= tab.dat$Freq)
result_c <- rbind(result_c, one.table)
}
result_list <- list(numericStats = result_n, FactorStats =result_c)
return(result_list)
}
You can set the default value to a function:
table1 <- function(dat = NULL, numvar = NULL, charvar = NULL) {...
From there, the script can determine which is missing and go from there.
Here's the answer:
table1 <- function(dat, numvar=NULL, charvar=NULL){
result_n <- numeric()
result_c <- data.frame()
#This is the original table function for numerical values
#I borrowed builtin function (ifmissing) from the internet
if(!missing(numvar)) {for (i in 1:length(numvar)) {
new_row <- c(round(mean(dat[[numvar[i]]],na.rm = T),2) ,
round(median(dat[[numvar[i]]],na.rm = T),2),
round(sd(dat[[numvar[i]]],na.rm = T),2),
length(dat[[numvar[i]]])-sum(is.na(dat[[numvar[i]]])),
sum(is.na(dat[[numvar[i]]])))
result_n <- rbind(result_n,new_row)
}
rownames(result_n) <- numvar
colnames(result_n) <- c("Mean", "Median", "SD", "N", "N_miss")}
#Thisi is the new table for char values
#I borrowed builtin function (ifmissing) from the internet
if(!missing(charvar)) {for (i in 1:length(charvar)) {
tab.dat <- as.data.frame(table(dat[charvar[i]],useNA = "ifany" ))
a1 <- as.character(tab.dat$Var1)
a1[3] <- "NMiss"
one.table <- data.frame(
Varname = c(charvar[i], rep(" ", nrow(tab.dat)-1)),
group = a1,
count= tab.dat$Freq)
result_c <- rbind(result_c, one.table)
}}
result_list <- list(numericStats = result_n, FactorStats =result_c)
return(result_list)
}
Related
I have an output of RNA-seq reads from CLC genomics workbench, for Arabidopsis thaliana. The list of genes contains a mix of gene names (i.e. "TRY", "TMM", "SVP", "FLC"), and IDs (e.g. "AT1G01390", "AT1G01310", "AT1G01240"). I would like to convert them all to gene names, so I can run it through a GO terms R package (the package seemingly does not read IDs like AT1G01390).
When I use biomaRt's getBM() function, it returns a lot less genes than the list of genes I'm reading into it. The original list from CLC has all Arabidopsis genes (27,655) and the outputs from getBM() generally have 12,085 gene names or less.
Anybody done this type of conversion before with success?
Thanks in advance!
I've tried various types of attributes, but none of them have worked.
#data load in and conversions, meta matrix/design creation:
#reads file was created in CLC Genomics Workbench, then the reads column copied and pasted for
#each sample
reads <- as.matrix(read.csv("genereads_ONLY4.txt", sep = '\t', row.names = 1, header = TRUE))
meta <- read.table("metatest4.txt", header = TRUE, fileEncoding= "UTF-16LE")
mart = useMart(biomart="plants_mart",host="plants.ensembl.org")
listDatasets(useMart(biomart="plants_mart",host="plants.ensembl.org"))
ensembl = useDataset("athaliana_eg_gene",mart= mart)
genes <- row.names(reads)
test1 <- getBM(attributes='external_gene_name',
values = genes,
mart = ensembl)
Okay, I found a round about way to solve this, at least for my scenario.
The gmt and fgsea information I'm using can only read gene symbols (e.g. "TRY") or entrez IDs. So I wrote a function to convert all of the information I had to either symbols or entrez IDs. The code is:
reads <- as.matrix(read.csv("genereads_ONLY4.txt", sep = '\t', row.names = 1, header = TRUE))
genes <- row.names(reads)
sum(lengths(regmatches(genes, gregexpr("\\AT[0-9]", genes, ignore.case = TRUE))))
#genes <- c("TRY", "AT2G46410", "AT5G41315", "AT2G42200", "AT1G10280")
IDconvert <- function(genes) {
for (i in genes){
if (grepl("AT[0-9]", i) == TRUE) {
if (is.na(getSYMBOL(i, data='org.At.tair.db')) == TRUE) {
if (is.na(getEG(i, data='org.At.tair')) == TRUE) {
i <- i
} else{
name <- getEG(i, data='org.At.tair')
name.l <- as.list(name)
newname <- as.character(name.l[[1]])
genes <- sub(i, newname, genes)
}
} else{
name <- getSYMBOL(i, data='org.At.tair')
name.l <- as.list(name)
newname <- as.character(name.l[[1]])
genes <- sub(i, newname, genes)
}
} else{
NULL
}
}
return(genes)
}
genes2 <- IDconvert(genes)
sum(lengths(regmatches(genes2, gregexpr("\\AT[0-9]", genes2, ignore.case = TRUE))))
row.names(reads) <- genes2
gmt <- read.gmt("GSEA_BIO.gmt")
gmt.ids <- read.gmt("IB_BIO_GMT.gmt")
gmt.combo <- c(gmt, gmt.ids)
#Stage 3 GO terms
names3 <- row.names(sub.break3)
sub.break3$names=names3
ranks <- sub.break3$stat
names(ranks) <- sub.break3$names
sub.break3.rank <- sort(ranks, decreasing = T)
fgseaRes3 <- fgsea(pathways = gmt.combo,
stats = sub.break3.rank,
minSize=5,
maxSize=500,
nperm=100000)
fgsea3.sig <- fgseaRes3[pval < 0.05]
pathways.stg3 <- fgsea3.sig$pathway
#Stage 1 GO terms
names1 <- row.names(sub.break1)
sub.break1$names=names1
ranks <- sub.break1$stat
names(ranks) <- sub.break1$names
sub.break1.rank <- sort(ranks, decreasing = T)
fgseaRes1 <- fgsea(pathways = gmt.combo,
stats = sub.break1.rank,
minSize=5,
maxSize=500,
nperm=100000)
fgsea1.sig <- fgseaRes1[pval < 0.05]
pathways.stg1 <- fgsea1.sig$pathway
#Stage 2 GO terms
names2 <- row.names(sub.break2)
sub.break2$names=names2
ranks <- sub.break2$stat
names(ranks) <- sub.break2$names
sub.break2.rank <- sort(ranks, decreasing = T)
fgseaRes2 <- fgsea(pathways = gmt.combo,
stats = sub.break2.rank,
minSize=5,
maxSize=500,
nperm=100000)
fgsea2.sig <- fgseaRes2[pval < 0.05]
pathways.stg2 <- fgsea2.sig$pathway
#Stage 4 GO terms
names4 <- row.names(sub.break4)
sub.break4$names=names4
ranks <- sub.break4$stat
names(ranks) <- sub.break4$names
sub.break4.rank <- sort(ranks, decreasing = T)
fgseaRes4 <- fgsea(pathways = gmt.combo,
stats = sub.break4.rank,
minSize=5,
maxSize=500,
nperm=100000)
fgsea4.sig <- fgseaRes4[pval < 0.05]
pathways.stg4 <- fgsea4.sig$pathway
#openxlsx::write.xlsx(fgsea4.sig, "fgsea_stg4_t1.xlsx")
#GO Venn-----------------------------------
group.venn(list(One = pathways.stg1,
Two = pathways.stg2,
Three = pathways.stg3,
Four = pathways.stg4),
fill = c("orange", "green", "red", "blue"))
I am trying to do a generic function to construct a formula for lineal regression. I want that the function create the formula either
using user defined variables or,
using all the variables present in the dataframe.
I can create the formula using all the variables present in the dataframe but my problem is when I try to get the user defined variables, I do not know exactly how to get the variables to later use them to create the formula.
The function that I have until now is this:
lmformula <- function (data, IndepVariable = character, VariableList = TRUE){
if (VariableList) {
newlist <- list()
newlist <- # Here is where I do not exactly what to do to extract the variables defined by user
DependVariables <- newlist
f <- as.formula(paste(IndepVariable, "~", paste((DependVariables), collapse = '+')))
}else {
names(data) <- make.names(colnames(data))
DependVariables <- names(data)[!colnames(data)%in% IndepVariable]
f <- as.formula(paste(IndepVariable,"~", paste((DependVariables), collapse = '+')))
return (f)
}
}
Please any hint will be deeply appreciated
The only thing that changes is how you get the independent variables
If the user specifies them, then use that character vector directly
Else, you have to to take all the variables other than the dependent variable(which you are already doing)
Note : As Roland mentioned, the formula is like dependentVariable ~ independentVariable1 + independentVariable2 + independentVariable3
# creating mock data
data <- data.frame(col1 = numeric(0), col2 = numeric(0), col3 = numeric(0), col4 = numeric(0))
# the function
lmformula <- function (data, DepVariable, IndepVariable, VariableList = TRUE) {
if (!VariableList) {
IndepVariable <- names(data)[!names(data) %in% DepVariable]
}
f <- as.formula(paste(DepVariable,"~", paste(IndepVariable, collapse = '+')))
return (f)
}
# working examples
lmformula(data = data, DepVariable = "col1", VariableList = FALSE)
lmformula(data = data, DepVariable = "col1", IndepVariable = c("col2", "col3"), VariableList = TRUE)
Hope it helps!
I have to admit I am new to coding functions, wherefore I need your help.
This code shall provide a Bayesian criterion (pBIC) following an ANOVA and automatically read the necessary information from the ANOVA table.
I have two functions
## This is function 1
test_pBIC1 <- function(name,c){ ## name is the name of the ANOVA table, e.g. "ANOVA_ALL_wake" and c is the number of conditions
c = c
data = get(name)
i = length(data$ANOVA$Effect)
result1 = data.frame(name,c,i)
return(result1)
}
## ----------------------------------------------------
## I now run and save the result of Function 1
result1 <- test_pBIC1("ANOVA_ALL_wake",3) ## for test
## ----------------------------------------------------
## This is function 2
test_pBIC2 <- function(result1){
name1 <- as.character(result1$name)
data = get(name1)
count <- as.vector(result1$i)
for (i in 1:count){
s = (data$ANOVA$DFd[i]/data$ANOVA$DFn[i])+1
n = s*(result1[2]-1)
SSE1 = data$ANOVA$SSd[i]
SSE0 = data$ANOVA$SSd[i]+data$ANOVA$SSn[i]
deltaBIC = (n * log(SSE1/SSE0))+(data$ANOVA$DFn[i]*log(n))
BF01 = exp(deltaBIC/2)
pH0_D = (BF01/(1+BF01))
pH1_D = (1-pH0_D)
result = data.frame(pH0_D, pH1_D)
colnames(result) <- c("pH0_D", "pH1_D")
rownames(result) <- c(data$ANOVA$Effect[i])
if (i == 1){
result_all <- result
} else {
result_all <- rbind (result_all, result)
}
}
return(result_all)
}
## ------------------------------------------------------
Now I run function 2 and receive the result
test_pBIC2(result1)
Now while this does it's job, I would like to link the two functions so I just have to give the name and the parameter c and still get result_all in the end, i.e. without having to run the two functions after each other.
I have tried to come up with this solution:
test_pBIC <- function(name,c){ ## pass arguments as: test_pBIC(name = "ANOVA_all_wake", c = 3)
c = c
name = name
result1 = data.frame(name,c)
# return(result1)
test_pBIC1 <- function(result1){
c = as.vector(result1$c)
name1 <- as.character(result1$name)
data = get(name)
i = length(data$ANOVA$Effect)
result2 = data.frame(name,c,i)
# return(result2)
test_pBIC2 <- function(result2){
name1 <- as.character(result2$name)
data = get(name1)
count <- as.numeric(integer$i)
for (i in 1:count){
s = (data$ANOVA$DFd[i]/data$ANOVA$DFn[i])+1
n = s*(result1[2]-1)
SSE1 = data$ANOVA$SSd[i]
SSE0 = data$ANOVA$SSd[i]+data$ANOVA$SSn[i]
deltaBIC = (n * log(SSE1/SSE0))+(data$ANOVA$DFn[i]*log(n))
BF01 = exp(deltaBIC/2)
pH0_D = (BF01/(1+BF01))
pH1_D = (1-pH0_D)
result = data.frame(pH0_D, pH1_D)
colnames(result) <- c("pH0_D", "pH1_D")
rownames(result) <- c(data$ANOVA$Effect[i])
if (i == 1){
result_all <- result
} else {
result_all <- rbind (result_all, result)
}
}
return(result_all)
}
}
}
test_pBIC("ANOVA_all_wake", 3)
However, I just get NOTHING...and I cannot find the mistake :(.
Thanks!!
Not entirely sure what the issue is, a reproducible example would help a lot. If you want to just combine it into one function you could do...
test_overall <- function(name,c) {
c = c
data = get(name)
i = length(data$ANOVA$Effect)
result1 = data.frame(name,c,i)
name1 <- as.character(result1$name)
data = get(name1)
count <- as.vector(result1$i)
for (i in 1:count){
s = (data$ANOVA$DFd[i]/data$ANOVA$DFn[i])+1
n = s*(result1[2]-1)
SSE1 = data$ANOVA$SSd[i]
SSE0 = data$ANOVA$SSd[i]+data$ANOVA$SSn[i]
deltaBIC = (n * log(SSE1/SSE0))+(data$ANOVA$DFn[i]*log(n))
BF01 = exp(deltaBIC/2)
pH0_D = (BF01/(1+BF01))
pH1_D = (1-pH0_D)
result = data.frame(pH0_D, pH1_D)
colnames(result) <- c("pH0_D", "pH1_D")
rownames(result) <- c(data$ANOVA$Effect[i])
if (i == 1){
result_all <- result
} else {
result_all <- rbind (result_all, result)
}
}
return(result_all)
}
In your first code example you've created functions test_pBIC1 and test_pBIC2. If you want to create a function test_pBIC that calls both, you can just define a function that calls both:
test_pBIC <- function(name, c) test_pBIC2(test_pBIC1(name, c))
I just discovered the power of plyr frequency table with several variables in R
and I am still struggling to understand how it works and I hope some here can help me.
I would like to create a table (data frame) in which I can combine frequencies and summary stats but without hard-coding the values.
Here an example dataset
require(datasets)
d1 <- sleep
# I classify the variable extra to calculate the frequencies
extraClassified <- cut(d1$extra, breaks = 3, labels = c('low', 'medium', 'high') )
d1 <- data.frame(d1, extraClassified)
The results I am looking for should look like that :
require(plyr)
ddply(d1, "group", summarise,
All = length(ID),
nLow = sum(extraClassified == "low"),
nMedium = sum(extraClassified == "medium"),
nHigh = sum(extraClassified == "high"),
PctLow = round(sum(extraClassified == "low")/ length(ID), digits = 1),
PctMedium = round(sum(extraClassified == "medium")/ length(ID), digits = 1),
PctHigh = round(sum(extraClassified == "high")/ length(ID), digits = 1),
xmean = round(mean(extra), digits = 1),
xsd = round(sd(extra), digits = 1))
My question: how can I do this without hard-coding the values?
For the records:
I tried this code, but it does not work
ddply (d1, "group",
function(i) c(table(i$extraClassified),
prop.table(as.character(i$extraClassified))),
)
Thanks in advance
Here's an example to get you started:
foo <- function(x,colfac,colval){
tbl <- table(x[,colfac])
res <- cbind(n = nrow(x),t(tbl),t(prop.table(tbl)))
colnames(res)[5:7] <- paste(colnames(res)[5:7],"Pct",sep = "")
res <- as.data.frame(res)
res$mn <- mean(x[,colval])
res$sd <- sd(x[,colval])
res
}
ddply(d1,.(group),foo,colfac = "extraClassified",colval = "extra")
Don't take anything in that function foo as gospel. I just wrote that off the top of my head. Surely improvements/modifications are possible, but at least it's something to start with.
Thanks to Joran.
I slighlty modified your function to make it more generic (without reference to the position of the variables) .
require(plyr)
foo <- function(x,colfac,colval)
{
# table with frequencies
tbl <- table(x[,colfac])
# table with percentages
tblpct <- t(prop.table(tbl))
colnames( tblpct) <- paste(colnames(t(tbl)), 'Pct', sep = '')
# put the first part together
res <- cbind(n = nrow(x), t(tbl), tblpct)
res <- as.data.frame(res)
# add summary statistics
res$mn <- mean(x[,colval])
res$sd <- sd(x[,colval])
res
}
ddply(d1,.(group),foo,colfac = "extraClassified",colval = "extra")
and it works !!!
P.S : I still do not understand what (group) stands for but
I want to visualize a mosaic plot in form of a tree. For example
mosaicplot(~ Sex + Age + Survived, data = Titanic, color = TRUE)
Now what I want is to represent this in a tree form where the first node
for example be sex the second node be age and at the terminal node be number of people survived. May be it should something like http://addictedtor.free.fr/graphiques/RGraphGallery.php?graph=84 where instead of p giving the number of counts.
Is there an function in R to do this or should I write it on my own by taking at a look
at the party:::plot.BinaryTree function
Here is how I managed to get what I wanted with the lovely igraph package. The code is an ugly hack. It will be great to have you suggestions
library(igraph)
rm(list=ls())
req.data <- as.data.frame(Titanic)
lookup <- c("M","F","C","A","N","Y")
names(lookup) <- c("Male","Female","Child","Adult","Yes","No")
req.data$board <- "board"
req.data$Class.m <- paste(req.data$board,req.data$Class,sep="_")
req.data$Sex.m <- paste(req.data$board,req.data$Class,req.data$Sex,
sep="_")
req.data$Age.m <- paste(req.data$board,req.data$Class,req.data$Sex,
req.data$Age,sep="_")
req.data$Survived.m <- paste(req.data$board,req.data$Class,req.data$Sex,
req.data$Age,req.data$Survived,sep="_")
tmp <- data.frame(from=
do.call("c",lapply(req.data[,c("board",
"Class.m",
"Sex.m",
"Age.m")],as.character)),
to=do.call("c",lapply(req.data[,c("Class.m",
"Sex.m",
"Age.m",
"Survived.m")],as.character)),
stringsAsFactors=FALSE)
tmp <- tmp [!duplicated(tmp ),];rownames(tmp) <- NULL
tmp$num <- unlist(lapply(strsplit(tmp$to,"_"),
FUN=function(x){
check1 <- req.data$Class==x[2]
check2 <- req.data$Sex == x[3]
check3 <- req.data$Age == x[4]
check4 <- req.data$Survived == x[5]
sum(req.data$Freq[ifelse(is.na(check1),TRUE,check1) &
ifelse(is.na(check2),TRUE,check2) &
ifelse(is.na(check3),TRUE,check3) &
ifelse(is.na(check4),TRUE,check4)])}))
g <- graph.data.frame(tmp, directed=TRUE)
V(g)$label <- unlist(lapply(strsplit(V(g)$name,"_"),
FUN=function(y){ifelse(y[length(y)] %in% names(lookup),
lookup[y[length(y)]],y[length(y)])}))
E(g)$label <- tmp$num
plot(g,layout=layout.reingold.tilford,ylim=c(1,-1),edge.arrow.size=0.5,vertex.size=7)
legend("topleft", paste(lookup ,names(lookup),sep=" : "),ncol=2,bty="n",cex=0.7)
### To find the case for crew members
tmp1 <- tmp [grepl("Crew",tmp$from),];rownames(tmp1) <- NULL
g <- graph.data.frame(tmp1, directed=TRUE)
V(g)$label <- unlist(lapply(strsplit(V(g)$name,"_"),
FUN=function(y){ifelse(y[length(y)] %in% names(lookup),
lookup[y[length(y)]],y[length(y)])}))
E(g)$label <- tmp1$num
plot(g,layout=layout.reingold.tilford,ylim=c(1,-1),edge.arrow.size=0.5)
legend("topleft", paste(lookup ,names(lookup),sep=" : "),ncol=2,bty="n",cex=0.7)
Here is the plot I generate. You can modify the vertex/edge colors/size as you want
This is pretty close and looks a lot easier to me.. I post it here in case it may be of use. First I convert the ftable to a more traditional long data frame using expand.dft https://stat.ethz.ch/pipermail/r-help/2009-January/185561.html Then I just use the plot.dendrite function from the plotrix package.
expand.dft <- function(x, var.names = NULL, freq = "Freq", ...)
{
# allow: a table object, or a data frame in frequency form
if(inherits(x, "table"))
x <- as.data.frame.table(x, responseName = freq)
freq.col <- which(colnames(x) == freq)
if (length(freq.col) == 0)
stop(paste(sQuote("freq"), "not found in column names"))
DF <- sapply(1:nrow(x),
function(i) x[rep(i, each = x[i, freq.col]), ],
simplify = FALSE)
DF <- do.call("rbind", DF)[, -freq.col]
for (i in 1:ncol(DF))
{
DF[[i]] <- type.convert(as.character(DF[[i]]), ...)
}
rownames(DF) <- NULL
if (!is.null(var.names))
{
if (length(var.names) < dim(DF)[2])
{
stop(paste("Too few", sQuote("var.names"), "given."))
} else if (length(var.names) > dim(DF)[2]) {
stop(paste("Too many", sQuote("var.names"), "given."))
} else {
names(DF) <- var.names
}
}
DF
}
library(plotrix)
r = ftable(Titanic)
plot.dendrite(makeDendrite(expand.dft(data.frame(r))))