Altering padding when combining tableGrob and ggplot objects in R - r

I am trying to combine multiple tableGrob objects with a ggplot object into a single .png file; I am having trouble understanding how to edit the tableGrob theme parameters in a way that lets me adjust the padding and dimensions of the table objects. Ideally, I want them to be in a 4*1 grid, with minimal padding between each. The text of the table objects should also be left-justified.
I am working with dummy data, and each row of my input dataset will be used to create its own .png file (two rows are included in the code snippet below to generate a reproducible example).
I tried to use this post as an example, and set the grid.arrange spacing based on the "heights" attributes of each table object, but this hasn't quite done the trick. As a side note, right now, the plot gets overwritten each time; I'll fix this later, am just concerned with getting the output dimensions/arrangement correct. Code is below; edited to include library calls, and fixed a typo:
require("ggplot2")
require("gridExtra")
require("grid")
# Generate dummy data frame
sampleVector <- c("1", "Amazing", "Awesome", "0.99", "0.75", "0.5", "$5,000.00", "0.55", "0.75", "0.31", "0.89", "0.25", "Strong community support", "Strong leadership", "Partners had experience", "", "CBO not supportive", "Limited experience", "Limited monitoring", "")
sampleVectorB <- c("3", "Amazing", "Awesome", "0.99", "0.75", "0.5", "$5,000.00", "0.55", "0.75", "0.31", "0.89", "0.25", "Strong community support", "Strong leadership", "Partners had experience", "", "CBO not supportive", "Limited experience", "Limited monitoring", "")
sampleDF <- data.frame(rbind(sampleVector, sampleVectorB))
colnames(sampleDF) <- c("CBO", "PMQ", "HMQ", "ER", "PR", "HR", "NS", "CTI", "Home and Hosp", "Home", "Phone", "Other", "S1", "S2", "S3", "S4", "C1", "C2", "C3", "C4")
indata <- sampleDF
#Finds the longest string from a vector of strings (i.e. #chars, incld. whitespace); returns index into this vector that corresponds to this string
findMax <- function(tempVector){
tempMaxIndex <- 1
for(i in 1:length(tempVector)){
print(nchar(tempVector[i]))
if(nchar(tempVector[i]) > nchar(tempVector[tempMaxIndex])){
tempMaxIndex <- i
}
}
return(tempMaxIndex)
}
# Same as above but w/o the colon:
addWhitespacePlain <- function(stringVec, maxNum){
for(i in 1:length(stringVec))
{
string <- stringVec[i]
while(nchar(string) < maxNum+1){
string <- paste(string, " ")
}
stringVec[i] <- string
}
return(stringVec)
}
staticText <- c("Participant Match Quality", "Hospital-Level Match Quality", "Enrollment Rate", "Participant Readmissions", "Hospital Readmissions", "Net Savings",
"Strengths", "Challenges")
m <- findMax(staticText)
staticText <- addWhitespacePlain(staticText, nchar(staticText[m]))
# Loop through our input data and keep only one CBO each pass
for(i in 1:length(indata$CBO)){
# Select only the row that has this CBO's data
temp <- indata[i,]
###############################################################################################
# MAKE TOP TEXT TABLE (as a DF)
# Get values from our input data set to fill in the values for the top text portion of the graphic
topVals <- t(data.frame(temp[2], temp[3], temp[4], temp[5], temp[6], temp[7]))
topDF <- data.frame(staticText[1:6], topVals, row.names=NULL)
colnames(topDF) <- c(paste("CBO", temp[1]), " ")
# Find which of the strings from the top text portion is the longest (i.e. max # of chars; including whitespace)
m2 <- findMax(rownames(topDF)) # returns an index into the vector; this index corresponds to string w/max num of chars
# Add whitespace to non-max strings so all have the same length and also include colons
rownames(topDF) <- addWhitespacePlain(rownames(topDF), nchar(rownames(topDF)[m2]))
# for testing
# print(topDF, right=FALSE)
###############################################################################################
# MAKE BAR CHART
#Subset the data to select the vars we need for the horizontal bar plot
graphdata <- t(data.frame(temp[,8:12]))
vars <- c("CTI", "Home & Hosp.", "Home", "Phone", "Other")
graphDF <- data.frame(vars, graphdata, row.names = NULL)
colnames(graphDF) <- c("vars", "values")
# Make the plot (ggplot object)
barGraph <- ggplot(graphDF, aes(x=vars, y=values,fill=factor(vars))) +
geom_bar(stat = "identity") +
theme(axis.title.y=element_blank())+
theme(legend.position="none")+
coord_flip()
# print(barGraph)
###############################################################################################
# MAKE BOTTOM TEXT TABLE
strengths <- t(data.frame(temp[13], temp[14], temp[15], temp[16]))
challenges <- t(data.frame(temp[17], temp[18], temp[19], temp[20]))
#Drop nulls
strengths <- data.frame(strengths[which(!is.na(strengths)),], row.names=NULL)
challenges <- data.frame(challenges[which(!is.na(challenges)),], row.names=NULL)
colnames(strengths) <- c(staticText[7])
colnames(challenges) <- c(staticText[8])
###############################################################################################
# OUTPUT (padding not resolved yet)
# Set the path for the combined image
png("test1", height=1500, width=1000)
#customTheme <- ttheme_minimal(core=list(fg_params=list(hjust=0, x=0.1)),
# rowhead=list(fg_params=list(hjust=0, x=0)))
# top<-tableGrob(topDF, theme=customTheme)
# bottom_strength <- tableGrob(strengths, theme=customTheme)
# bottom_challenges <- tableGrob(challenges, theme=customTheme)
top<-tableGrob(topDF)
bottom_strength <- tableGrob(strengths)
bottom_challenges <- tableGrob(challenges)
x <- sum(top$heights)
y <- sum(bottom_strength$heights)
z <- sum(bottom_challenges$heights)
grid.arrange(top, barGraph, bottom_strength, bottom_challenges,
as.table=TRUE,
heights=c(2, 1, 2, 2),
nrow = 4)
# heights= unit.c(x, unit(1), y, z))
dev.off()
}

Your example is too complicated, I'm not sure what exactly is the issue. The following 4x1 layout has zero padding, is that what you're after?
ta <- tableGrob(iris[1:4,1:2])
tb <- tableGrob(mtcars[1:3,1:3])
tc <- tableGrob(midwest[1:5,1:2])
p <- qplot(1,1) + theme(plot.background=element_rect(colour = "black"))
h <- unit.c(sum(ta$heights), unit(1,"null"), sum(tb$heights), sum(tc$heights))
grid.newpage()
grid.arrange(ta,p,tb,tc, heights=h)

Related

Flextable: change cell value if it meets a condition and formatting

I have a dataset (called example) like the following one.
mic <- rep(c("One", "Two", "Tree", "Four"), each = 3)
pap <- rep(c("1", "2", "3", "4"), each = 3)
ref <- rep(c("Trial 1", "Trial 2", "Trial 3", "Trial 4"), each = 3)
prob <- c(rep(NA,4), "Nogood", NA, "Bad", "Nogood", "Norel", NA, "Bad", "Nogood")
example <- data.frame(Micro = mic, Paper = pap, Reference = ref, Problem = prob)
example
Example
I would like to merge cells vertically when consecutive cells have identical
values so I use flextable merge_v() function.
ft_example <- example %>%
flextable() %>%
merge_v(j = ~ Micro + Paper + Reference + Problem) %>%
theme_vanilla()
ft_example
I obtain the following table when knitting in Word:
Table obtained
Is there a way to:
Insert a posteriori the value "None identified" in the empty
cells in the "Problem" field that are merged together; and
Remove the inappropriate horizontal lines in the "Problem" field when
there is one (or more) not empty cells and some empty cells so that
there is one horizontal line clearly separating each combination of
Micro, Paper, Reference and horizontal lines separating only non
empty cells in the Problem field?
You can see the desired result here below:
Table desired

Word cloud in R with multiple words and special characters

I want to create a wordcloud with R. I want to visualize the occurence of variable names, which may consist of more than one word and also special characters and numbers, for example one variable name is "S & P 500 dividend yield".
The variable names are in a text file and they are no further separated. Every line of the text file contains a new variable name.
I tried the folowing code, however the variable names are split into different characters:
library(tm)
library(SnowballC)
library(wordcloud)
library(RColorBrewer)
# load the text:
text <- readLines("./Overview_used_series.txt")
docs <- Corpus(VectorSource(text))
inspect(docs)
# build a term-document matrix:
tdm <- TermDocumentMatrix(docs)
m <- as.matrix(tdm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
head(d, 10)
# generate the wordcloud:
pdf("Word cloud.pdf")
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
dev.off()
How can I treat the variable names, so that they are visualized in the wordcloud with their original names as in the text file?
If you have a file as you specified with a variable name per line, there is no need to use tm. You can easily create your own word frequency table to use as input. When using tm, it will split words based a space and will not respect your variable names.
Starting from when the text is loaded, just create a data.frame with where frequency is set to 1 and then you can just aggregate everything. wordcloud also accepts data.frame like this and you can just create a wordcloud from this. Note that I adjusted the scale a bit, because when you have long variable names, they might not get printed. You will get a warning message when this happens.
I'm not inserting the resulting picture.
#text <- readLines("./Overview_used_series.txt")
text <- c("S & P 500 dividend yield", "S & P 500 dividend yield", "S & P 500 dividend yield",
"visualize ", "occurence ", "variable names", "visualize ", "occurence ",
"variable names")
# freq = 1 adds a columns with just 1's for every value.
my_data <- data.frame(text = text, freq = 1, stringsAsFactors = FALSE)
# aggregate the data.
my_agr <- aggregate(freq ~ ., data = my_data, sum)
wordcloud(words = my_agr$text, freq = my_agr$freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"), scale = c(2, .5))

R dashboard, googlevis options

here is my Q:
I am using googlevis (gvisTreeMap) function to illustrate a data frame. it is automatically put the labels on each block, but I want to have the numbers too.
here is my simplified code:
col1 <- c(1,2,3,5,8)
col2 <- c("a","b","c","d","e")
fdata <- data.frame(col1,col2)
total <- data.frame(col1=sum(fdata$col1), col2="Market Share")
fdata1 <- rbind(total, fdata)
fdata1$parent="Market Share"
## Set parent variable to NA at root level
fdata1$parent[fdata1$col2=="Market Share"] <- NA
fdata1$col1.log=log(fdata1$col1)
aa <- gvisTreeMap(fdata1, "col2", "parent",
"col1", "col1.log",
options=list(width=600, height=500,
fontSize=16,
minColor='#EDF8FB',
midColor='#66C2A4',
maxColor='#006D2C',
headerHeight=20,
fontColor='black',
showScale=TRUE, lable="$$"))
plot(aa)
to make it more clear, after I run the code I have five blocks with letters on it, but I want both letters and numbers.
Thanks
I changed your definition of col2 to append the size to each entry (with the exception of the root node):
fdata1$col2=paste0(fdata1$col2, " - ", fdata1$col1)
fdata$col2[1] <- "Market Share"

Suppressing column names within a matrix and looping function

I am attempting to create my first custom function in R (yay!). I've got something that sort of works now but I think it could be improved.
Basically, I want to create my own custom table within R that can be run through xtable for a final report. I want the table to follow this format for each column:
group1mean, group1sd, group2mean, group2sd, t-value, p-value.
At current, my function does this. However, it produces column names (e.g., V3 and V4) that I would like to leave blank and I would like to have it loop through multiple dependent variables and append the results as new rows in the matrix automatically. Right now, I have to write a line of code for each dependent variable manually (in the example below the DVs are PWB, SWB, and EWB.
Here is my code so far:
data <- read.delim("~/c4044sol.txt", header=T)
library(psych)
proc.ttest <- function(dv,group,decimals) {
x1 <- describeBy((dv), (group), mat=TRUE)
stat1 <- t.test((dv) ~ (group))
output1 <- c(paste (round(x1$mean[1], digits=(decimals)),"(", round(x1$sd[1], digits= (decimals)), ")", sep =" "),
paste (round(x1$mean[2], digits=(decimals)), "(", round(x1$sd[2], digits=(decimals)), ")", sep =" "),
round(stat1$statistic, digits=2), round(stat1$p.value, digits=3))
return(output1)
}
toprow <- c("M (SD)", "M (SD)", "t", "p")
outtable <- rbind(toprow,
proc.ttest(data$PWB, data$college, 2),
proc.ttest(data$SWB, data$college, 2),
proc.ttest(data$EWB, data$college, 2))
colnames(outtable) <- c("College graduate", "Less than college graduate", "", "")
row.names(outtable) <- c("", "PWB", "SWB", "EWB")
library(xtable)
xtable(outtable)
So to repeat, I would like to suppress the column names "V3" and "V4" (leave them blank) and make the code run automatically on a list of variables. Are either of these things possible? Thanks for your time.
Try keeping outtable as you have it, but without toprow.
Instead, use toprow as the names:
toprow <- c("M (SD)", "M (SD)", "t", "p")
outtable <- rbind( # toprow,
proc.ttest(data$PWB, data$college, 2),
proc.ttest(data$SWB, data$college, 2),
proc.ttest(data$EWB, data$college, 2))
names(outtable) <- toprow
## note that the parens and spaces are
## not best practices, but this should still
## get your your desired results
I fixed the extra column labels printing issue by putting all the labels I actually wanted in the final table in the first two rows of the matrix...
toptoprow <- c("College graduate", "Less than college graduate", "", "")
toprow <- c("M (SD)", "M (SD)", "t", "p")
outtable <- rbind(toptoprow,toprow, proc.ttest(PWB, college, 2),
proc.ttest(SWB, college, 2),
proc.ttest(EWB, college, 2))
And then suppressing the colnames using the print function (as suggested by Ricardo)...
print(xtable(outtable), hline.after=c(-1,1,nrow(outtable)),include.colnames=FALSE)
I still would like to automate the function itself so I can ideally give it a list of variable names, it will run the function on each variable, and populate the results in the final matrix. But one baby step at at time...

Using R to parse out Surveymonkey csv files

I'm trying to analyse a large survey created with surveymonkey which has hundreds of columns in the CSV file and the output format is difficult to use as the headers run over two lines.
Has anybody found a simple way of managing the headers in the CSV file so that the analysis is manageable ?
How do other people analyse results from Surveymonkey?
Thanks!
You can export it in a convenient form that fits R from Surveymonkey, see download responses in 'Advanced Spreadsheet Format'
What I did in the end was print out the headers using libreoffice labeled as V1,V2, etc. then I just read in the file as
m1 <- read.csv('Sheet1.csv', header=FALSE, skip=1)
and then just did the analysis against m1$V10, m1$V23 etc...
To get around the mess of multiple columns I used the following little function
# function to merge columns into one with a space separator and then
# remove multiple spaces
mcols <- function(df, cols) {
# e.g. mcols(df, c(14:18))
exp <- paste('df[,', cols, ']', sep='', collapse=',' )
# this creates something like...
# "df[,14],df[,15],df[,16],df[,17],df[,18]"
# now we just want to do a paste of this expression...
nexp <- paste(" paste(", exp, ", sep=' ')")
# so now nexp looks something like...
# " paste( df[,14],df[,15],df[,16],df[,17],df[,18] , sep='')"
# now we just need to parse this text... and eval() it...
newcol <- eval(parse(text=nexp))
newcol <- gsub(' *', ' ', newcol) # replace duplicate spaces by a single one
newcol <- gsub('^ *', '', newcol) # remove leading spaces
gsub(' *$', '', newcol) # remove trailing spaces
}
# mcols(df, c(14:18))
No doubt somebody will be able to clean this up!
To tidy up Likert-like scales I used:
# function to tidy c('Strongly Agree', 'Agree', 'Disagree', 'Strongly Disagree')
tidylik4 <- function(x) {
xlevels <- c('Strongly Disagree', 'Disagree', 'Agree', 'Strongly Agree')
y <- ifelse(x == '', NA, x)
ordered(y, levels=xlevels)
}
for (i in 44:52) {
m2[,i] <- tidylik4(m2[,i])
}
Feel free to comment as no doubt this will come up again!
I have to deal with this pretty frequently, and having the headers on two columns is a bit painful. This function fixes that issue so that you only have a 1 row header to deal with. It also joins the multipunch questions so you have top: bottom style naming.
#' #param x The path to a surveymonkey csv file
fix_names <- function(x) {
rs <- read.csv(
x,
nrows = 2,
stringsAsFactors = FALSE,
header = FALSE,
check.names = FALSE,
na.strings = "",
encoding = "UTF-8"
)
rs[rs == ""] <- NA
rs[rs == "NA"] <- "Not applicable"
rs[rs == "Response"] <- NA
rs[rs == "Open-Ended Response"] <- NA
nms <- c()
for(i in 1:ncol(rs)) {
current_top <- rs[1,i]
current_bottom <- rs[2,i]
if(i + 1 < ncol(rs)) {
coming_top <- rs[1, i+1]
coming_bottom <- rs[2, i+1]
}
if(is.na(coming_top) & !is.na(current_top) & (!is.na(current_bottom) | grepl("^Other", coming_bottom)))
pre <- current_top
if((is.na(current_top) & !is.na(current_bottom)) | (!is.na(current_top) & !is.na(current_bottom)))
nms[i] <- paste0(c(pre, current_bottom), collapse = " - ")
if(!is.na(current_top) & is.na(current_bottom))
nms[i] <- current_top
}
nms
}
If you note, it returns the names only. I typically just read.csv with ...,skip=2, header = FALSE, save to a variable and overwrite the names of the variable. It also helps ALOT to set your na.strings and stringsAsFactor = FALSE.
nms = fix_names("path/to/csv")
d = read.csv("path/to/csv", skip = 2, header = FALSE)
names(d) = nms
As of November 2013, the webpage layout seems to have changed. Choose Analyze results > Export All > All Responses Data > Original View > XLS+ (Open in advanced statistical and analytical software). Then go to Exports and download the file. You'll get raw data as first row = question headers / each following row = 1 response, possibly split between multiple files if you have many responses / questions.
The issue with the headers is that columns with "select all that apply" will have a blank top row, and the column heading will be the row below. This is only an issue for those types of questions.
With this in mind, I wrote a loop to go through all columns and replace the column names with the value from the second row if the column name was blank- which has a character length of 1.
Then, you can kill the second row of the data and have a tidy data frame.
for(i in 1:ncol(df)){
newname <- colnames(df)[i]
if(nchar(newname) < 2){
colnames(df)[i] <- df[1,i]
}
df <- df[-1,]
Coming to the party late, but this is still an issue and the best workaround I've found is using a function to paste the column names and sub-column names together, based on repeating values.
For instance, if exporting to .csv, the repeated column names will automatically be replaced with an X in RStudio. If exporting to .xlsx, the repeated value will be ....
Here's a base R solution:
sm_header_function <- function(x, rep_val){
orig <- x
sv <- x
sv <- sv[1,]
sv <- sv[, sapply(sv, Negate(anyNA)), drop = FALSE]
sv <- t(sv)
sv <- cbind(rownames(sv), data.frame(sv, row.names = NULL))
names(sv)[1] <- "name"
names(sv)[2] <- "value"
sv$grp <- with(sv, ave(name, FUN = function(x) cumsum(!startsWith(name, rep_val))))
sv$new_value <- with(sv, ave(name, grp, FUN = function(x) head(x, 1)))
sv$new_value <- paste0(sv$new_value, " ", sv$value)
new_names <- as.character(sv$new_value)
colnames(orig)[which(colnames(orig) %in% sv$name)] <- sv$new_value
orig <- orig[-c(1),]
return(orig)
}
sm_header_function(df, "X")
sm_header_function(df, "...")
With some sample data, the change in column names would look like this:
Original export from SurveyMonkey:
> colnames(sample)
[1] "Respondent ID" "Please provide your contact information:" "...11"
[4] "...12" "...13" "...14"
[7] "...15" "...16" "...17"
[10] "...18" "...19" "I wish it would have snowed more this winter."
Cleaned export from SurveyMonkey:
> colnames(sample_clean)
[1] "Respondent ID" "Please provide your contact information: Name"
[3] "Please provide your contact information: Company" "Please provide your contact information: Address"
[5] "Please provide your contact information: Address 2" "Please provide your contact information: City/Town"
[7] "Please provide your contact information: State/Province" "Please provide your contact information: ZIP/Postal Code"
[9] "Please provide your contact information: Country" "Please provide your contact information: Email Address"
[11] "Please provide your contact information: Phone Number" "I wish it would have snowed more this winter. Response"
Sample data:
structure(list(`Respondent ID` = c(NA, 11385284375, 11385273621,
11385258069, 11385253194, 11385240121, 11385226951, 11385212508
), `Please provide your contact information:` = c("Name", "Benjamin Franklin",
"Mae Jemison", "Carl Sagan", "W. E. B. Du Bois", "Florence Nightingale",
"Galileo Galilei", "Albert Einstein"), ...11 = c("Company", "Poor Richard's",
"NASA", "Smithsonian", "NAACP", "Public Health Co", "NASA", "ThinkTank"
), ...12 = c("Address", NA, NA, NA, NA, NA, NA, NA), ...13 = c("Address 2",
NA, NA, NA, NA, NA, NA, NA), ...14 = c("City/Town", "Philadelphia",
"Decatur", "Washington", "Great Barrington", "Florence", "Pisa",
"Princeton"), ...15 = c("State/Province", "PA", "Alabama", "D.C.",
"MA", "IT", "IT", "NJ"), ...16 = c("ZIP/Postal Code", "19104",
"20104", "33321", "1230", "33225", "12345", "8540"), ...17 = c("Country",
NA, NA, NA, NA, NA, NA, NA), ...18 = c("Email Address", "benjamins#gmail.com",
"mjemison#nasa.gov", "stargazer#gmail.com", "dubois#web.com",
"firstnurse#aol.com", "galileo123#yahoo.com", "imthinking#gmail.com"
), ...19 = c("Phone Number", "215-555-4444", "221-134-4646",
"999-999-4422", "999-000-1234", "123-456-7899", "111-888-9944",
"215-999-8877"), `I wish it would have snowed more this winter.` = c("Response",
"Strongly disagree", "Strongly agree", "Neither agree nor disagree",
"Strongly disagree", "Disagree", "Agree", "Strongly agree")), row.names = c(NA,
-8L), class = c("tbl_df", "tbl", "data.frame"))
How about the following: use read.csv() with header=FALSE. Make two arrays, one with the two lines of headings and one with the answers to the survey. Then paste() the two rows/sentences of together. Finally, use colnames().

Resources