Multiple boxplots with predefined statistics using lattice-like graphs in r - r

I have a dataset which looks like this
VegType 87MIN 87MAX 87Q25 87Q50 87Q75 96MIN 96MAX 96Q25 96Q50 96Q75 00MIN 00MAX 00Q25 00Q50 00Q75
1 0.02 0.32 0.11 0.12 0.13 0.02 0.26 0.08 0.09 0.10 0.02 0.28 0.10 0.11 0.12
2 0.02 0.45 0.12 0.13 0.13 0.02 0.20 0.09 0.10 0.11 0.02 0.26 0.11 0.12 0.12
3 0.02 0.29 0.13 0.14 0.14 0.02 0.27 0.11 0.11 0.12 0.02 0.26 0.12 0.13 0.13
4 0.02 0.41 0.13 0.13 0.14 0.02 0.58 0.10 0.11 0.12 0.02 0.34 0.12 0.13 0.13
5 0.02 0.42 0.12 0.13 0.14 0.02 0.46 0.10 0.11 0.11 0.02 0.28 0.12 0.12 0.13
6 0.02 0.32 0.13 0.14 0.14 0.02 0.52 0.12 0.12 0.13 0.02 0.29 0.13 0.14 0.14
7 0.02 0.55 0.12 0.13 0.14 0.02 0.24 0.10 0.11 0.11 0.02 0.37 0.12 0.12 0.13
8 0.02 0.55 0.12 0.13 0.14 0.02 0.19 0.10 0.11 0.12 0.02 0.22 0.11 0.12 0.13
In reality I have 26 variables and 5 years (87,96 and 00 in the column names are years). In an ideal world I would like to have a lattice-like graph with 26 plots, one per variable, with each plot containing 5 boxes, i.e. one per year. I understand that it is not possible to do this is lattice because lattice won't accept predefined statistics. Is there a fairly unpainful way to do this in R with predefined stats? I have used bxp for simple boxplots plotting all the variables for one year in a single plot e.g.
Yr01 = read.csv('dat.csv',header=T)
dat01=t(Yr01[,c("01Min","01Q25","01Mean","01Q75","01Max")])
bxp(list(stats=dat01, n=rep(26, ncol(dat01))),ylim=c(0.07,0.2))
but I don't know how to go from there to what I need.
Thanks.

This can be done, at least using ggplot2, but you'll have to reshape your data quite a bit. And you really have to have a data where the quantiles actually make sense!! Your quantile values are all messed up! For example, Var1 has 01Max = 0.26 and 01Q75 = .67!!
First, I'll recreate a valid data:
n <- c("01Min", "01Max", "01Med", "01Q25", "01Q75", "02Min",
"02Max", "02Med", "02Q25", "02Q75")
v1 <- c(0.03, 0.76, 0.41, 0.13, 0.67, 0.10, 0.43, 0.27, 0.2, 0.33)
v2 <- c(0.03, 0.28, 0.14, 0.08, 0.20, 0.02, 0.77, 0.13, 0.06, 0.44)
df <- data.frame(v1=v1, v2=v2)
df <- as.data.frame(t(df))
names(df) <- n
df <- cbind(var=c("v1","v2"), df)
> df
# var 01Min 01Max 01Med 01Q25 01Q75 02Min 02Max 02Med 02Q25 02Q75
# v1 v1 0.03 0.76 0.41 0.13 0.67 0.10 0.43 0.27 0.20 0.33
# v2 v2 0.03 0.28 0.14 0.08 0.20 0.02 0.77 0.13 0.06 0.44
Next, we'll reshape the data:
require(reshape2)
df.m <- melt(df, id="var")
# look for a bunch of numbers from the start of the string and capture it
# in the first variable: () captures the pattern. And replace it with the
# captured pattern with the variable "\\1"
df.m$year <- gsub("^([0-9]+)(.*$)", "\\1", df.m$variable)
# the same but instead refer to the captured pattern in the second
# paranthesis using "\\2"
df.m$quan <- gsub("^([0-9]+)(.*)$", "\\2", df.m$variable)
df.f <- dcast(df.m, var+year ~ quan, value.var="value")
To get to this format:
> df.f
# var year Max Med Min Q25 Q75
# 1 v1 01 0.76 0.41 0.03 0.13 0.67
# 2 v1 02 0.43 0.27 0.10 0.20 0.33
# 3 v2 01 0.28 0.14 0.03 0.08 0.20
# 4 v2 02 0.77 0.13 0.02 0.06 0.44
Now, we can plot by directly providing the quantile values to corresponding parameters using the corresponding column names as follows:
require(ggplot2)
require(scales)
p <- ggplot(df.f, aes(x=var, ymin=`Min`, lower=`Q25`, middle=`Med`,
upper=`Q75`, ymax=`Max`))
p <- p + geom_boxplot(aes(fill=year), stat="identity")
p
# if you want facetting:
p + facet_wrap( ~ var, scales="free")
You can now accomplish your task of plotting all years for each var in a separate plot using a lapply with this code and subsetting as follows:
lapply(levels(df.f$var), function(x) {
p <- ggplot(df.f[df.f$var == x, ],
aes(x=var, ymin=`Min`, lower=`Q25`,
middle=`Med`, upper=`Q75`, ymax=`Max`))
p <- p + geom_boxplot(aes(fill=year), stat="identity")
p
ggsave(paste0(x, ".pdf"), last_plot())
})
Edit: Your data is different from the earlier data you provided in some aspects. So, here's the version of the code for your new data:
# change var to VegType everywhere
require(reshape2)
df.m <- melt(df, id="VegType")
df.m$year <- gsub("^X([0-9]+)(.*$)", "\\1", df.m$variable) # pattern has a X
df.m$quan <- gsub("^X([0-9]+)(.*)$", "\\2", df.m$variable) # pattern has a X
df.f <- dcast(df.m, VegType+year ~ quan, value.var="value")
df.f$VegType <- factor(df.f$VegType) # convert integer to factor
require(ggplot2)
require(scales)
p <- ggplot(df.f, aes(x=VegType, ymin=`MIN`, lower=`Q25`, middle=`Q50`,
upper=`Q75`, ymax=`MAX`))
p <- p + geom_boxplot(aes(fill=year), stat="identity")
p
You can facet/write as separate plots using same code as before.

Related

Include facet_wrap in R line plot

I have the following code to plot a line graph:
df %>% pivot_longer(-Client) %>%
ggplot(aes(x=name,y=value,color=factor(Client),group=factor(Client)))+
geom_line()+
xlab('Client')+
theme_bw()+
labs(color='Client')
It plots a line for each of my clients, but since i have too many clients, plot all of them in one graph gets pretty messy, I've been tryin to use the facet_wrap() function to divide the clients in separate graphs but couldn't figure out how to do this, so here I am...
There is a sample of my data:
Client Model_1 Model_2 Model_3 Model_4 Model_5
1 10.34 0.22 0.62 0.47 1.96
2 0.97 0.60 0.04 0.78 0.19
3 2.01 0.15 0.27 0.49 0.00
4 0.57 0.94 0.11 0.66 0.00
5 0.68 0.65 0.26 0.41 0.50
6 0.55 3.59 0.06 0.01 5.50
7 10.68 1.08 0.07 0.16 0.20
Try creating a group over number of customer using module like this:
library(ggplot2)
library(dplyr)
library(tidyr)
#Code
df %>% pivot_longer(-Client) %>%
mutate(Group=ifelse(Client %% 2==0,'G1','G2')) %>%
ggplot(aes(x=name,y=value,color=factor(Client),group=factor(Client)))+
geom_line()+
xlab('Client')+
theme_bw()+
labs(color='Client')+
facet_wrap(.~Group,scales = 'free')
Output:

'x' and 'y' lengths differ in custom entropy function

I am trying to learn R and I am having problems with the way it works. I tried to make an entropy function of variables p and 1-p from scratch and I am having problems when I try to add some ifs to avoid the NaN when dividing by 0.
When I try the custom entropy with the plot, it just works but it shows the NaN when I print the results. But when I try to add the ifs, then it says:
Error in xy.coords(x, y, xlabel, ylabel, log) :
'x' and 'y' lengths differ
entropy <- function(p){
cat("p = " , p)
if (p==0 || p==1) {
result = 0
}else{
result = - p*log2(p)-(1-p)*log2((1-p))
}
cat("\nresult=",result)
return(result)
}
p <- seq(0,1,0.01)
plot(p, entropy(p), type='l', main='Funcion entropia con dos valores posibles')
I don't understand it since I am using a plot of an array as x and a function with that array as parameter as y, so it should be the same lengths with and without ifs.
Console without the ifs:
p = 0 0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09 0.1 0.11 0.12 0.13 0.14 0.15 0.16 0.17 0.18 0.19 0.2 0.21 0.22 0.23 0.24 0.25 0.26 0.27 0.28 0.29 0.3 0.31 0.32 0.33 0.34 0.35 0.36 0.37 0.38 0.39 0.4 0.41 0.42 0.43 0.44 0.45 0.46 0.47 0.48 0.49 0.5 0.51 0.52 0.53 0.54 0.55 0.56 0.57 0.58 0.59 0.6 0.61 0.62 0.63 0.64 0.65 0.66 0.67 0.68 0.69 0.7 0.71 0.72 0.73 0.74 0.75 0.76 0.77 0.78 0.79 0.8 0.81 0.82 0.83 0.84 0.85 0.86 0.87 0.88 0.89 0.9 0.91 0.92 0.93 0.94 0.95 0.96 0.97 0.98 0.99 1
result= NaN 0.08079314 0.1414405 0.1943919 0.2422922 0.286397 0.3274449 0.3659237 0.4021792 0.4364698 0.4689956 0.499916 0.5293609 0.5574382 0.5842388 0.6098403 0.6343096 0.6577048 0.680077 0.7014715 0.7219281 0.7414827 0.7601675 0.7780113 0.7950403 0.8112781 0.8267464 0.8414646 0.8554508 0.8687212 0.8812909 0.8931735 0.9043815 0.9149264 0.9248187 0.9340681 0.9426832 0.9506721 0.958042 0.9647995 0.9709506 0.9765005 0.9814539 0.985815 0.9895875 0.9927745 0.9953784 0.9974016 0.9988455 0.9997114 1 0.9997114 0.9988455 0.9974016 0.9953784 0.9927745 0.9895875 0.985815 0.9814539 0.9765005 0.9709506 0.9647995 0.958042 0.9506721 0.9426832 0.9340681 0.9248187 0.9149264 0.9043815 0.8931735 0.8812909 0.8687212 0.8554508 0.8414646 0.8267464 0.8112781 0.7950403 0.7780113 0.7601675 0.7414827 0.7219281 0.7014715 0.680077 0.6577048 0.6343096 0.6098403 0.5842388 0.5574382 0.5293609 0.499916 0.4689956 0.4364698 0.4021792 0.3659237 0.3274449 0.286397 0.2422922 0.1943919 0.1414405 0.08079314 NaN
Console with the ifs:
p = 0 0.01 0.02 0.03 0.04 0.05 0.06 0.07 0.08 0.09 0.1 0.11 0.12 0.13 0.14 0.15 0.16 0.17 0.18 0.19 0.2 0.21 0.22 0.23 0.24 0.25 0.26 0.27 0.28 0.29 0.3 0.31 0.32 0.33 0.34 0.35 0.36 0.37 0.38 0.39 0.4 0.41 0.42 0.43 0.44 0.45 0.46 0.47 0.48 0.49 0.5 0.51 0.52 0.53 0.54 0.55 0.56 0.57 0.58 0.59 0.6 0.61 0.62 0.63 0.64 0.65 0.66 0.67 0.68 0.69 0.7 0.71 0.72 0.73 0.74 0.75 0.76 0.77 0.78 0.79 0.8 0.81 0.82 0.83 0.84 0.85 0.86 0.87 0.88 0.89 0.9 0.91 0.92 0.93 0.94 0.95 0.96 0.97 0.98 0.99 1
result= 0Error in xy.coords(x, y, xlabel, ylabel, log) :
'x' and 'y' lengths differ
You did not create a vector but a scalar since you did not used a vectorized functionality in you if else clause. The result of your function has been just one number.
This should work:
entropy <- function(p){
# initialize a vector of the desired length with zeros
result <- numeric(length(p))
# subset the vector for which you want to apply your formula on
x <- p[!(p %in% c(0,1))]
# overwrite only those positions for which you want to calculate values based
# on your formula
result[!(p %in% c(0,1))] <- - x*log2(x)-(1-x)*log2((1-x))
#cat("\nresult=",result)
return(result)
}
p <- seq(0,1,0.01)
plot(p, entropy(p), type='l', main='Funcion entropia con dos valores posibles')
EDIT:
Even tho I was suggested to do it vectorizing it, I wanted to do it somewhat similar to other languages I know for the moment, since I am starting. I was able to fix it, althought I ended up using a for and printing 2 arrays instead of the function itself.
entropy <- function(p){
if (p==0 || p==1) {
result = 0
}else{
result = - p*log2(p)-(1-p)*log2((1-p))
}
return(result)
}
x <- seq(0,1,0.01)
y <- numeric(length(p))
i = 1
for (p in x) {
y[i] = entropy(p)
cat(x[i],"=",y[i],"\n")
i=i+1
}
plot(x, y, type='l', main='Funcion entropia con dos valores posibles')
I just applied your entropy function to the p vector prior to trying to plot it using the sapply function.
entropy <- function(p){
cat("p = " , p)
if (p==0 || p==1) {
result = 0
}else{
result = - p*log2(p)-(1-p)*log2((1-p))
}
cat("\nresult=",result)
return(result)
}
p <- seq(0,1,0.01)
# Apply the function over all the values of 'p'
entropy_p <- sapply(p,FUN = entropy)
plot(p, entropy_p, type='l', main='Funcion entropia con dos valores posibles')

Logaritmic scale in x-axis

I have the following code:
S = [100 200 500 1000 10000];
H = [0.14 0.15 0.17 0.19 0.28;0.14 0.16 0.18 0.20 0.29;0.15 0.17 0.19 0.21 0.31;0.16 0.17 0.20 0.22 0.32;0.23 0.22 0.28 0.30 0.44;0.23 0.23 0.29 0.3 0.5;0.33 0.32 0.4 0.42 0.63;0.32 0.31 0.39 0.40 0.61;0.23 0.23 0.30 0.30 0.50];
for i = 1:9
hold on
plot(S, H(i,:));
legend('GHM01','GHM02','GHM03','GHM04','GHM05','GHM06','GHM07','GHM08','GHM09'); %legend not correctly
axis([100 10000 0.1 1])
end
set(gca,'xscale','log')
The x-axis looks like this:
Because The S-values are very far from each other, I used a logaritmic x-axis (and linear y-axis).
I have on the axis 5 values (see S), and I only want those 5 values visible on the x-axis with equidistant spacing between the values. How do I do this? Or is there a better alternative to display my x-axis, rather than logaritmic scale?
If you want the X-axis ticks to be equally distant although they are not (neither on a linear nor on a log scale) then you basically treat this axis as categorical, and then it should get and ordinal temporary value (say 1:5) to determine the distance between them.
Here is a quick implementation of your comment above:
S = {'100' '200' '500' '1000' '10000'};
H = [0.14 0.15 0.17 0.19 0.28;...
0.14 0.16 0.18 0.20 0.29;
0.15 0.17 0.19 0.21 0.31;
0.16 0.17 0.20 0.22 0.32;
0.23 0.22 0.28 0.30 0.44;
0.23 0.23 0.29 0.3 0.5;
0.33 0.32 0.4 0.42 0.63;
0.32 0.31 0.39 0.40 0.61;
0.23 0.23 0.30 0.30 0.50];
f = figure;
plot(1:length(S),H);
f.Children.XTick = 1:length(S);
f.Children.XTickLabel = S;
TMHO this is the most straightforward way to solve this problem ;)

Error using corrplot

I need help with interpreting an error message using corrplot.
Here is my script
install.packages("ggplot2")
install.packages("corrplot")
install.packages("xlsx")
library(ggplot2)
library(corrplot)
library(xlsx)
#set working dir
setwd("C:/R")
#read xlsx data into R
df <- read.xlsx("TP_diff_frame.xlsx",1)
#set column as index
rownames(df) <- df$country
#remove column
df2<-subset(df, select = -c(country) )
#round values to to decimals
corrplot(df2, method="shade",shade.col=NA, tl.col="black", tl.srt=45)
My df2:
> df2
a b c d e f g
Sweden 0.09 0.19 0.00 -0.25 -0.04 0.01 0.00
Germany 0.11 0.19 0.01 -0.35 0.01 0.02 0.01
UnitedKingdom 0.14 0.21 0.03 -0.32 -0.05 0.00 0.00
RussianFederation 0.30 0.26 -0.07 -0.41 -0.09 0.00 0.00
Netherlands 0.09 0.16 -0.05 -0.26 0.02 0.02 0.01
Belgium 0.12 0.20 0.01 -0.34 0.01 0.00 0.00
Italy 0.14 0.22 0.01 -0.37 0.00 0.00 0.00
France 0.14 0.24 -0.04 -0.34 0.00 0.00 0.00
Finland 0.16 0.17 0.01 -0.26 -0.08 0.00 0.00
Norway 0.15 0.21 0.10 -0.37 -0.09 0.00 0.00
And the error message:
> corrplot(df2, method="shade",shade.col=NA, tl.col="black", tl.srt=45)
Error in matrix(unlist(value, recursive = FALSE, use.names = FALSE), nrow = nr, :
length of 'dimnames' [2] not equal to array extent
I think the problem is that you are plotting the data frame instead of the correlation matrix. Try to change the last line to this:
corrplot(cor(df2), method="shade",shade.col=NA, tl.col="black", tl.srt=45)
The function cor calculates the correlation matrix, which is what you need to plot
In order to use the corrplot package for heatmap plots you should pass your data.frame to a matrix and also use the is.corr argument.
df2 <- as.matrix(df2)
corrplot(df2, is.corr=FALSE)
Another option is to break it up into two lines of code.
df2 <- cor(df, use = "na.or.complete")
corrplot(df2, method="shade",shade.col=NA, tl.col="black", tl.srt=45)
I'd run a simple corrplot (e.g. corrplot.mixed(df2)) make sure it works, then get into the fine tuning and aesthetics.

Colorize/highlight values of R ftable() output in knitr/Sweave rapports

I am generating a lot of ftable() crosstabulations for a descriptive report. Example:
AUS BEL BUL EST FRA GEO GER HUN ITA NET NOR ROM RUS
30- primary 0.06 0.03 0.07 0.03 0.02 0.03 0.03 0.02 0.05 0.03 0.05 0.04 0.02
secondary 0.30 0.09 0.16 0.10 0.10 0.14 0.10 0.16 0.11 0.08 0.08 0.09 0.11
tertiary 0.05 0.07 0.04 0.05 0.07 0.06 0.02 0.04 0.02 0.05 0.06 0.02 0.09
30+ primary 0.07 0.16 0.12 0.07 0.16 0.03 0.05 0.11 0.35 0.21 0.09 0.17 0.03
secondary 0.40 0.20 0.30 0.29 0.25 0.35 0.35 0.34 0.27 0.20 0.27 0.34 0.26
tertiary 0.13 0.23 0.13 0.18 0.17 0.17 0.18 0.09 0.09 0.23 0.23 0.06 0.24
60+ primary 0.00 0.12 0.10 0.13 0.14 0.07 0.05 0.12 0.09 0.11 0.06 0.19 0.12
secondary 0.00 0.05 0.05 0.08 0.06 0.10 0.14 0.09 0.02 0.04 0.11 0.07 0.06
tertiary 0.00 0.05 0.03 0.06 0.03 0.04 0.07 0.03 0.01 0.05 0.06 0.02 0.07
I am looking for a function that could take the ftable() or table() output, and highligh values that deviate from the row-mean, or assign an overall gradient to the text of the values, e.g. from 0-100% the values are coloured from red to green.
The output is now processed through knitr, but I'm not sure at which point in the toolchain I could intervene and add colour based on the relative size of the values.
You can use the latex function, in the Hmisc package.
# Example shamelessly copied from http://www.karlin.mff.cuni.cz/~kulich/vyuka/Rdoc/harrell-R-latex.pdf
cat('
\\documentclass{article}
\\usepackage[table]{xcolor}
\\begin{document}
<<results=tex>>=
library(Hmisc)
d <- head(iris)
cellTex <- matrix(rep("", nrow(d) * ncol(d)), nrow=nrow(d))
cellTex[2,2] <- "cellcolor{red}"
cellTex[2,3] <- "color{red}"
cellTex[5,1] <- "rowcolor{yellow}"
latex(d, file = "", cellTexCmds = cellTex, rowname=NULL)
#
\\end{document}',
file="tmp.Rnw" )
Sweave("tmp.Rnw")
library(utils)
texi2pdf("tmp.tex")
To generate latex tables from R objects, you can use the xtable package. It is available on CRAN, take a look at the documentation. To get the color in the table, use the color latex package. Some example code:
library(xtable)
n = 100
cat_country = c("NL","BE","HU")
cat_prim = c("primary","secondary","tertiary")
dat = data.frame(country = sample(cat_country, n, replace = TRUE),
prim = sample(cat_prim, n, replace = TRUE))
ftable_dat = ftable(dat)
## Make latex table:
latex_table = xtable(as.table(ftable_dat))
To get what you want I made the following hack (ugly one). The trick is to print the xtable object and than edit that:
latex_table = within(latex_table, {
# browser()
primary = ifelse(primary > 12, sprintf("\\textbf{%s}", primary), primary)
#primary = sub("\\{", "{", primary)
})
printed_table = print(latex_table)
printed_table = sub("backslash", "\\", printed_table)
printed_table = sub("\\\\}", "}", printed_table)
printed_table = sub("\\\\\\{", "{", printed_table)
printed_table = sub("\\$", "\\", printed_table)
printed_table = sub("\\$", "\\", printed_table)
cat(printed_table)
Which leads to:
% latex table generated in R 2.14.1 by xtable 1.6-0 package
% Thu Feb 16 13:10:55 2012
\begin{table}[ht]
\begin{center}
\begin{tabular}{rrrr}
\hline
& primary & secondary & tertiary \\
\hline
BE & 10 & 5 & 11 \\
HU & \textbf{13} & 13 & 8 \\
NL & 11 & 17 & 12 \\
\hline
\end{tabular}
    \end{center}
    \end{table}
This example makes a number in the primary category bold, but it can work for colorization just as easily. Maybe someone else has a more elegant solution?

Resources