Combine gtables with separate headings within loop - r

I have data on students exam results. Each student takes a 100 question exam. Each question has been allocated (unequally) to one of 5 domains and also has a unique learning outcome associated with it. I need to summarise the data in a table (likely spread over multiple pdf pages), identifying for each domain, the learning outcomes the student got correct/incorrect.
For each Domain I have managed to get a table which shows the the learning outcomes the student got correct/incorrect, but I can't figure out how to join the tables together after the loop. I'd really appreciate some help with this...
Please find below a copy of my code:
library(tidyverse)
library(dplyr)
library(gridExtra)
library(grid)
library(gtable)
library(ids)
# Create some random Learning Objectives:
LO <- ids::adjective_animal(2500, 2, style = "sentence")
# Create data:
testdata <- data.frame(cbind(StudentID=rep(1:25,each=100),
QuestionNumber=rep(1:100),
Correct=sample(c(0,1),replace=TRUE,size=2500),
Domain=c("A","A","A","B","C","C","C","D","D","E"),
LearningOutcome=LO))
rm(LO)
# Create lists for Loops:
DomainList <- unique(testdata$Domain)
StudentList <- unique(testdata$StudentID)
Domain.Colours <- list("steelblue3","red2","violetred3","forestgreen","chocolate1")
for (i in 1:length(unique(testdata$StudentID))){
for (j in 1:length(DomainList)){
# Select Domain Specfic Data
TEMP.DatabyDomain <- testdata[testdata$Domain==DomainList[j],]
# Split Learning Outcomes into Correct/Incorrect Lists
TEMP.Correct <- TEMP.DatabyDomain$LearningOutcome[TEMP.DatabyDomain$StudentID == StudentList[i] & TEMP.DatabyDomain$Correct == 1]
TEMP.Incorrect <- TEMP.DatabyDomain$LearningOutcome[TEMP.DatabyDomain$StudentID == StudentList[i] & TEMP.DatabyDomain$Correct == 0]
# Update the lengths to be the same
n <- max(length(TEMP.Correct), length(TEMP.Incorrect))
length(TEMP.Correct) <- n
length(TEMP.Incorrect) <- n
# Combine the data into a new df
FeedbackData <- data.frame(TEMP.Correct, TEMP.Incorrect)
FeedbackData <- sapply(FeedbackData, as.character)
# Replace NAs with ""
FeedbackData[is.na(FeedbackData)] <- " "
# Create column headings
colnames(FeedbackData) <- c(paste("Correctly answered questions\n in domain:",DomainList[j]),paste("Incorrectly answered questions\n in domain:",DomainList[j]))
# Table Settings
tt1 <- ttheme_default(core=list(fg_params=list(fontsize=8)),
rowhead=list(fg_params=list(hjust=0, x=0)),
colhead=list(bg_params=list(fill=paste(Domain.Colours[j])),
fg_params=list(col="white")))
# Table Results
tfeedback <- tableGrob(FeedbackData, theme=tt1, rows = NULL)
}
### Here is where I get stuck ###
# I would like to append the 5 Domain tables together, so that they are one long table with the column headings in-between
tfeedbacktitle <- textGrob(paste("Feedback for Students"),gp=gpar(fontsize=20))
padding <- unit(10,"mm")
table <- gtable_add_rows(
tfeedback,
heights = grobHeight(tfeedbacktitle) + padding,
pos = 0)
table <- gtable_add_grob(
table,
tfeedbacktitle,
1, 1, 1, ncol(table))
# Code below to allow table to cover multiple pages
# Taken from <https://stackoverflow.com/questions/15937131/print-to-pdf-file-using-grid-table-in-r-too-many-rows-to-fit-on-one-page>
fullheight <- convertHeight(sum(table$heights), "cm", valueOnly = TRUE)
margin <- unit(0,"in")
margin_cm <- convertHeight(margin, "cm", valueOnly = TRUE)
a4height <- 29.7 - margin_cm
nrows <- nrow(table)
npages <- ceiling(fullheight / a4height)
heights <- convertHeight(table$heights, "cm", valueOnly = TRUE)
rows <- cut(cumsum(heights), include.lowest = FALSE,
breaks = c(0, cumsum(rep(a4height, npages))))
groups <- split(seq_len(nrows), rows)
gl1 <- lapply(groups, function(id) table[id,])
mypath <- file.path(pathOutput,paste("StudentID_", StudentList[i], ".pdf", sep = ""))
pdf(file=mypath, paper = "a4", width = 0, height = 0)
for(page in seq_len(npages)){
grid.newpage()
grid.rect(width=unit(21,"cm") - margin,
height=unit(29.7,"cm")- margin)
grid.draw(gl1[[page]])
}
dev.off()
}
The link shows a picture of what I hope to achieve: Example Final Result

Related

How to create a table from a data.frame where a cell could have multiple values using R

I have tried looking at cheatsheets and looking over other questions asked on here but have been unsuccessful in finding an answer.
I am using R and My data.frame looks like this:
I want to take the second column and make it the vertical categories and make the third column the horizontal categories. The first column would then be matched to the corresponding categories in its row.
Here is an example of how I want to format the table:
Is there a way write a code to do this in order to avoid using Excel and Word to create the table?
Try this:
df = data.frame(let = LETTERS[1:12],
vert = c(10, 10, 2.5, 5, 10, 5, 2.5, 10, 1.25, 1.25, 1.25, 1.25),
hor = c(2,2,3,2,4,2,3,4,1,4,4,1),
stringsAsFactors = F)
# find unique combinations
positions = expand.grid(unique(df$vert), unique(df$hor))
# pre-allocate matrix
M = matrix(ncol = length(unique(df$hor)),
nrow = length(unique(df$vert)))
rownames(M) <- sort(unique(df$vert))
colnames(M) <- sort(unique(df$hor))
# loop over valid positions and put them in the matrix
for (i in c(1:nrow(positions))){
# get row
row = as.numeric(positions[i,])
# gather all entries that go in position
valid = df[df$vert == row[1] & df$hor == row[2], 'let']
valid = paste(valid, collapse=",")
# get matrix indices
vert_i <- which(rownames(M) == row[1])
horiz_i <- which(colnames(M) == row[2])
# put the data in the matrix
M[vert_i, horiz_i] <- valid
}
print(M)
It could be more efficient but it gets the job done.
Here is one way:
library(tidyr)
library(dplyr)
# example data
exd <- data.frame(
Vertical = c(10,10,2.5,5,10,5,2.5,10,1.25,1.25,1.25, 1.25),
Horizontal = c(2,2,3,2,4,2,3,4,1,4,4,1),
row.names = LETTERS[1:12])
# move values into a column
exd <- mutate(exd,
Value = rownames(exd))
# aggregate by Vertical and Horizontal
exd <- summarize(group_by(exd, Vertical, Horizontal),
Value = paste(Value, collapse = ","))
# re-arrange into the desired form
spread(exd, Horizontal, Value)

Create a matrix from a list consisting of unequal matrices for individual bootstraps

I tried to create a matrix from a list which consists of N unequal matrices...
The reason to do this is to make R individual bootstrap samples.
In the example below you can find e.g. 2 companies, where we have 1 with 10 & 1 with just 5 observations.
Data:
set.seed(7)
Time <- c(10,5)
xv <- matrix(c(rnorm(10,5,2), rnorm(5,20,1), rnorm(10,5,2), rnorm(5,20,1)), ncol=2);
y <- matrix( c(rnorm(10,5,2), rnorm(5,20,1)));
z <- matrix(c(rnorm(10,5,2), rnorm(5,20,1), rnorm(10,5,2), rnorm(5,20,1)), ncol=2)
# create data frame of input variables which helps
# to conduct the rowise bootstrapping
data <- data.frame (y = y, xv = xv, z = z);
rows <- dim(data)[1];
cols <- dim(data)[2];
# create the index to sample from the different panels
cumTime <- c(0, cumsum (Time));
index <- findInterval (seq (1:rows), cumTime, left.open = TRUE);
# draw R individual bootstrap samples
bootList <- replicate(R = 5, list(), simplify=F);
bootList <- lapply (bootList, function(x) by (data, INDICES = index, FUN = function(x) dplyr::sample_n (tbl = x, size = dim(x)[1], replace = T)));
---------- UNLISTING ---------
Currently, I try do it incorrectly like this:
Example for just 1 entry of the list:
matrix(unlist(bootList[[1]], recursive = T), ncol = cols)
The desired output is just
bootList[[1]]
as a matrix.
Do you have an idea how to do this & if possible reasonably efficient?
The matrices are then processed in unfortunately slow MLE estimations...
i found a solution for you. From what i gather, you have a Dataframe containing all observations of all companies, which may have different panel lengths. And as a result you would like to have a Bootstap sample for each company of same size as the original panel length.
You mearly have to add a company indicator
data$company = c(rep(1, 10), rep(2, 5)) # this could even be a factor.
L1 = split(data, data$company)
L2 = lapply(L1, FUN = function(s) s[sample(x = 1:nrow(s), size = nrow(s), replace = TRUE),] )
stop here if you would like to have saperate bootstap samples e.g. in case you want to estimate seperately
bootdata = do.call(rbind, L2)
Best wishes,
Tim

Vectorization of a nested for-loop that inputs all paired combinations

I thought that the following problem must have been answered or a function must exist to do it, but I was unable to find an answer.
I have a nested loop that takes a row from one 3-col. data frame and copies it next to each of the other rows, to form a 6-col. data frame (with all possible combinations). This works fine, but with a medium sized data set (800 rows), the loops take forever to complete the task.
I will demonstrate on a sample data set:
Sdat <- data.frame(
x = c(10,20,30,40),
y = c(15,25,35,45),
ID =c(1,2,3,4)
)
compar <- data.frame(matrix(nrow=0, ncol=6)) # to contain all combinations
names(compar) <- c("x","y", "ID", "x","y", "ID")
N <- nrow(Sdat) # how many different points we have
for (i in 1:N)
{
for (j in 1:N)
{
Temp1 <- Sdat[i,] # data from 1st point
Temp2 <- Sdat[j,] # data from 2nd point
C <- cbind(Temp1, Temp2)
compar <- rbind(C,compar)
}
}
These loops provide exactly the output that I need for further analysis. Any suggestion for vectorizing this section?
You can do:
ind <- seq_len(nrow(Sdat))
grid <- expand.grid(ind, ind)
compar <- cbind(Sdat[grid[, 1], ], Sdat[grid[, 2], ])
A naive solution using rep (assuming you are happy with a data frame output):
compar <- data.frame(x = rep(Sdat$x, each = N),
y = rep(Sdat$y, each = N),
id = rep(1:n, each = N),
x1 = rep(Sdat$x, N),
y1 = rep(Sdat$y, N),
id_1 = rep(1:n, N))

How to collapse branches in a phylogenetic tree by the label in their nodes or leaves?

I have built a phylogenetic tree for a protein family that can be split into different groups, classifying each one by its type of receptor or type of response. The nodes in the tree are labeled as the type of receptor.
In the phylogenetic tree I can see that proteins that belong to the same groups or type of receptor have clustered together in the same branches. So I would like to collapse these branches that have labels in common, grouping them by a given list of keywords.
The command would be something like this:
./collapse_tree_by_label -f phylogenetic_tree.newick -l list_of_labels_to_collapse.txt -o collapsed_tree.eps(or pdf)
My list_of_labels_to_collapse.txt would be like this:
A
B
C
D
My newick tree would be like this:
(A_1:0.05,A_2:0.03,A_3:0.2,A_4:0.1):0.9,(((B_1:0.05,B_2:0.02,B_3:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2)
The output image without collapsing is like this:
http://i.stack.imgur.com/pHkoQ.png
The output image collapsing should be like this (collapsed_tree.eps):
http://i.stack.imgur.com/TLXd0.png
The width of the triangles should represent the branch length, and the high of the triangles must represent the number of nodes in the branch.
I have been playing with the "ape" package in R. I was able to plot a phylogenetic tree, but I still can't figure out how to collapse the branches by keywords in the labels:
require("ape")
This will load the tree:
cat("((A_1:0.05,A_2:0.03,A_3:0.2,A_4:0.1):0.9,(((B_1:0.05,B_2:0.02,B_3:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2):0.5);", file = "ex.tre", sep = "\n")
tree.test <- read.tree("ex.tre")
Here should be the code to collapse
This will plot the tree:
plot(tree.test)
Your tree as it is stored in R already has the tips stored as polytomies. It's just a matter of plotting the tree with triangles representing the polytomies.
There is no function in ape to do this, that I am aware of, but if you mess with the plotting function a little bit you can pull it off
# Step 1: make edges for descendent nodes invisible in plot:
groups <- c("A", "B", "C", "D")
group_edges <- numeric(0)
for(group in groups){
group_edges <- c(group_edges,getMRCA(tree.test,tree.test$tip.label[grepl(group, tree.test$tip.label)]))
}
edge.width <- rep(1, nrow(tree.test$edge))
edge.width[tree.test$edge[,1] %in% group_edges ] <- 0
# Step 2: plot the tree with the hidden edges
plot(tree.test, show.tip.label = F, edge.width = edge.width)
# Step 3: add triangles
add_polytomy_triangle <- function(phy, group){
root <- length(phy$tip.label)+1
group_node_labels <- phy$tip.label[grepl(group, phy$tip.label)]
group_nodes <- which(phy$tip.label %in% group_node_labels)
group_mrca <- getMRCA(phy,group_nodes)
tip_coord1 <- c(dist.nodes(phy)[root, group_nodes[1]], group_nodes[1])
tip_coord2 <- c(dist.nodes(phy)[root, group_nodes[1]], group_nodes[length(group_nodes)])
node_coord <- c(dist.nodes(phy)[root, group_mrca], mean(c(tip_coord1[2], tip_coord2[2])))
xcoords <- c(tip_coord1[1], tip_coord2[1], node_coord[1])
ycoords <- c(tip_coord1[2], tip_coord2[2], node_coord[2])
polygon(xcoords, ycoords)
}
Then you just have to loop through the groups to add the triangles
for(group in groups){
add_polytomy_triangle(tree.test, group)
}
I've also been searching for this kind of tool for ages, not so much for collapsing categorical groups, but for collapsing internal nodes based on a numerical support value.
The di2multi function in the ape package can collapse nodes to polytomies, but it currently can only does this by branch length threshold.
Here is a rough adaptation that allows collapsing by a node support value threshold instead (default threshold = 0.5).
Use at your own risk, but it works for me on my rooted Bayesian tree.
di2multi4node <- function (phy, tol = 0.5)
# Adapted di2multi function from the ape package to plot polytomies
# based on numeric node support values
# (di2multi does this based on edge lengths)
# Needs adjustment for unrooted trees as currently skips the first edge
{
if (is.null(phy$edge.length))
stop("the tree has no branch length")
if (is.na(as.numeric(phy$node.label[2])))
stop("node labels can't be converted to numeric values")
if (is.null(phy$node.label))
stop("the tree has no node labels")
ind <- which(phy$edge[, 2] > length(phy$tip.label))[as.numeric(phy$node.label[2:length(phy$node.label)]) < tol]
n <- length(ind)
if (!n)
return(phy)
foo <- function(ancestor, des2del) {
wh <- which(phy$edge[, 1] == des2del)
for (k in wh) {
if (phy$edge[k, 2] %in% node2del)
foo(ancestor, phy$edge[k, 2])
else phy$edge[k, 1] <<- ancestor
}
}
node2del <- phy$edge[ind, 2]
anc <- phy$edge[ind, 1]
for (i in 1:n) {
if (anc[i] %in% node2del)
next
foo(anc[i], node2del[i])
}
phy$edge <- phy$edge[-ind, ]
phy$edge.length <- phy$edge.length[-ind]
phy$Nnode <- phy$Nnode - n
sel <- phy$edge > min(node2del)
for (i in which(sel)) phy$edge[i] <- phy$edge[i] - sum(node2del <
phy$edge[i])
if (!is.null(phy$node.label))
phy$node.label <- phy$node.label[-(node2del - length(phy$tip.label))]
phy
}
This is my answer based on phytools::phylo.toBackbone function,
see http://blog.phytools.org/2013/09/even-more-on-plotting-subtrees-as.html, and http://blog.phytools.org/2013/10/finding-edge-lengths-of-all-terminal.html. First, load the function at the end of code.
library(ape)
library(phytools) #phylo.toBackbone
library(phangorn)
cat("((A_1:0.05,E_2:0.03,A_3:0.2,A_4:0.1,A_5:0.1,A_6:0.1,A_7:0.35,A_8:0.4,A_9:01,A_10:0.2):0.9,((((B_1:0.05,B_2:0.05):0.5,B_3:0.02,B_4:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2):0.5);"
, file = "ex.tre", sep = "\n")
phy <- read.tree("ex.tre")
groups <- c("A", "B|C", "D")
backboneoftree<-makebackbone(groups,phy)
# tip.label clade.label N depth
# 1 A_1 A 10 0.2481818
# 2 B_1 B|C 6 0.9400000
# 3 D_1 D 5 0.4600000
{
tryCatch(dev.off(),error=function(e){""})
par(fig=c(0,0.5,0,1), mar = c(0, 0, 2, 0))
plot(phy, main="Original" )
par(fig=c(0.5,1,0,1), oma = c(0, 0, 1.2, 0), xpd=NA, new=T)
plot(backboneoftree)
title(main="Clades")
}
makebackbone <- function(groupings,phy){
listofspecies <- phy$tip.label
listtopreserve <- character()
newedgelengths <- meandistnode<- lengthofclades<- numeric()
for (i in 1:length(groupings)){
bestmrca<-getMRCA(phy,grep(groupings[i], phy$tip.label) )
mrcatips<-phy$tip.label[unlist(phangorn::Descendants(phy,bestmrca, type="tips") )]
listtopreserve[i] <- mrcatips[1]
meandistnode[i] <- mean(dist.nodes(phy)[unlist(lapply(mrcatips,
function(x) grep(x, phy$tip.label) ) ),bestmrca] )
lengthofclades[i] <- length(mrcatips)
provtree <- drop.tip(phy,mrcatips, trim.internal=F, subtree = T)
n3 <- length(provtree$tip.label)
newedgelengths[i] <- setNames(provtree$edge.length[sapply(1:n3,function(x,y)
which(y==x),
y=provtree$edge[,2])],
provtree$tip.label)[provtree$tip.label[grep("tips",provtree$tip.label)] ]
}
newtree <- drop.tip(phy,setdiff(listofspecies,listtopreserve),
trim.internal = T)
n <- length(newtree$tip.label)
newtree$edge.length[sapply(1:n,function(x,y)
which(y==x),
y=newtree$edge[,2])] <- newedgelengths + meandistnode
trans <- data.frame(tip.label=newtree$tip.label,clade.label=groupings,
N=lengthofclades, depth=meandistnode )
rownames(trans) <- NULL
print(trans)
backboneoftree <- phytools::phylo.toBackbone(newtree,trans)
return(backboneoftree)
}
EDIT: I haven't tried this, but it might be another answer: "Script and function to transform the tip branches of a tree , i.e the thickness or to triangles, with the width of both correlating with certain parameters (e.g., species number of the clade) (tip.branches.R)"
https://www.en.sysbot.bio.lmu.de/people/employees/cusimano/use_r/index.html
I think the script is finally doing what I wanted.
From the answer that #CactusWoman provided, I changed the code a little bit so the script will try to find the MRCA that represents the largest branch that matches to my search pattern. This solved the problem of not merging non-polytomic branches, or collapsing the whole tree because one matching node was mistakenly outside the correct branch.
In addition, I included a parameter that represents the limit for the pattern abundance ratio in a given branch, so we can select and collapse/group branches that have at least 90% of its tips matching to the search pattern, for example.
library(geiger)
library(phylobase)
library(ape)
#functions
find_best_mrca <- function(phy, group, threshold){
group_matches <- phy$tip.label[grepl(group, phy$tip.label, ignore.case=TRUE)]
group_mrca <- getMRCA(phy,phy$tip.label[grepl(group, phy$tip.label, ignore.case=TRUE)])
group_leaves <- tips(phy, group_mrca)
match_ratio <- length(group_matches)/length(group_leaves)
if( match_ratio < threshold){
#start searching for children nodes that have more than 95% of descendants matching to the search pattern
mrca_children <- descendants(as(phy,"phylo4"), group_mrca, type="all")
i <- 1
new_ratios <- NULL
nleaves <- NULL
names(mrca_children) <- NULL
for(new_mrca in mrca_children){
child_leaves <- tips(tree.test, new_mrca)
child_matches <- grep(group, child_leaves, ignore.case=TRUE)
new_ratios[i] <- length(child_matches)/length(child_leaves)
nleaves[i] <- length(tips(phy, new_mrca))
i <- i+1
}
match_result <- data.frame(mrca_children, new_ratios, nleaves)
match_result_sorted <- match_result[order(-match_result$nleaves,match_result$new_ratios),]
found <- numeric(0);
print(match_result_sorted)
for(line in 1:nrow(match_result_sorted)){
if(match_result_sorted$ new_ratios[line]>=threshold){
return(match_result_sorted$mrca_children[line])
found <- 1
}
}
if(found==0){return(found)}
}else{return(group_mrca)}
}
add_triangle <- function(phy, group,phylo_plot){
group_node_labels <- phy$tip.label[grepl(group, phy$tip.label)]
group_mrca <- getMRCA(phy,group_node_labels)
group_nodes <- descendants(as(tree.test,"phylo4"), group_mrca, type="tips")
names(group_nodes) <- NULL
x<-phylo_plot$xx
y<-phylo_plot$yy
x1 <- max(x[group_nodes])
x2 <-max(x[group_nodes])
x3 <- x[group_mrca]
y1 <- min(y[group_nodes])
y2 <- max(y[group_nodes])
y3 <- y[group_mrca]
xcoords <- c(x1,x2,x3)
ycoords <- c(y1,y2,y3)
polygon(xcoords, ycoords)
return(c(x2,y3))
}
#main
cat("((A_1:0.05,E_2:0.03,A_3:0.2,A_4:0.1,A_5:0.1,A_6:0.1,A_7:0.35,A_8:0.4,A_9:01,A_10:0.2):0.9,((((B_1:0.05,B_2:0.05):0.5,B_3:0.02,B_4:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2):0.5);", file = "ex.tre", sep = "\n")
tree.test <- read.tree("ex.tre")
# Step 1: Find the best MRCA that matches to the keywords or search patten
groups <- c("A", "B|C", "D")
group_labels <- groups
group_edges <- numeric(0)
edge.width <- rep(1, nrow(tree.test$edge))
count <- 1
for(group in groups){
best_mrca <- find_best_mrca(tree.test, group, 0.90)
group_leaves <- tips(tree.test, best_mrca)
groups[count] <- paste(group_leaves, collapse="|")
group_edges <- c(group_edges,best_mrca)
#Step2: Remove the edges of the branches that will be collapsed, so they become invisible
edge.width[tree.test$edge[,1] %in% c(group_edges[count],descendants(as(tree.test,"phylo4"), group_edges[count], type="all")) ] <- 0
count = count +1
}
#Step 3: plot the tree hiding the branches that will be collapsed/grouped
last_plot.phylo <- plot(tree.test, show.tip.label = F, edge.width = edge.width)
#And save a copy of the plot so we can extract the xy coordinates of the nodes
#To get the x & y coordinates of a plotted tree created using plot.phylo
#or plotTree, we can steal from inside tiplabels:
last_phylo_plot<-get("last_plot.phylo",envir=.PlotPhyloEnv)
#Step 4: Add triangles and labels to the collapsed nodes
for(i in 1:length(groups)){
text_coords <- add_triangle(tree.test, groups[i],last_phylo_plot)
text(text_coords[1],text_coords[2],labels=group_labels[i], pos=4)
}
This doesn't address depicting the clades as triangles, but it does help with collapsing low-support nodes. The library ggtree has a function as.polytomy which can be used to collapse nodes based on support values.
For example, to collapse bootstraps less than 50%, you'd use:
polytree = as.polytomy(raxtree, feature='node.label', fun=function(x) as.numeric(x) < 50)

Combining frequencies and summary statistics in one table?

I just discovered the power of plyr frequency table with several variables in R
and I am still struggling to understand how it works and I hope some here can help me.
I would like to create a table (data frame) in which I can combine frequencies and summary stats but without hard-coding the values.
Here an example dataset
require(datasets)
d1 <- sleep
# I classify the variable extra to calculate the frequencies
extraClassified <- cut(d1$extra, breaks = 3, labels = c('low', 'medium', 'high') )
d1 <- data.frame(d1, extraClassified)
The results I am looking for should look like that :
require(plyr)
ddply(d1, "group", summarise,
All = length(ID),
nLow = sum(extraClassified == "low"),
nMedium = sum(extraClassified == "medium"),
nHigh = sum(extraClassified == "high"),
PctLow = round(sum(extraClassified == "low")/ length(ID), digits = 1),
PctMedium = round(sum(extraClassified == "medium")/ length(ID), digits = 1),
PctHigh = round(sum(extraClassified == "high")/ length(ID), digits = 1),
xmean = round(mean(extra), digits = 1),
xsd = round(sd(extra), digits = 1))
My question: how can I do this without hard-coding the values?
For the records:
I tried this code, but it does not work
ddply (d1, "group",
function(i) c(table(i$extraClassified),
prop.table(as.character(i$extraClassified))),
)
Thanks in advance
Here's an example to get you started:
foo <- function(x,colfac,colval){
tbl <- table(x[,colfac])
res <- cbind(n = nrow(x),t(tbl),t(prop.table(tbl)))
colnames(res)[5:7] <- paste(colnames(res)[5:7],"Pct",sep = "")
res <- as.data.frame(res)
res$mn <- mean(x[,colval])
res$sd <- sd(x[,colval])
res
}
ddply(d1,.(group),foo,colfac = "extraClassified",colval = "extra")
Don't take anything in that function foo as gospel. I just wrote that off the top of my head. Surely improvements/modifications are possible, but at least it's something to start with.
Thanks to Joran.
I slighlty modified your function to make it more generic (without reference to the position of the variables) .
require(plyr)
foo <- function(x,colfac,colval)
{
# table with frequencies
tbl <- table(x[,colfac])
# table with percentages
tblpct <- t(prop.table(tbl))
colnames( tblpct) <- paste(colnames(t(tbl)), 'Pct', sep = '')
# put the first part together
res <- cbind(n = nrow(x), t(tbl), tblpct)
res <- as.data.frame(res)
# add summary statistics
res$mn <- mean(x[,colval])
res$sd <- sd(x[,colval])
res
}
ddply(d1,.(group),foo,colfac = "extraClassified",colval = "extra")
and it works !!!
P.S : I still do not understand what (group) stands for but

Resources