R: Geom_histogram - r

I am trying to create a histogram using geom_histogram() that uses a numeric variable for both the x and y axis.
The numeric x axis will be bucketed and the numeric y axis will show the sum of some other numeric value for each bucket. Right now, I am not having any luck and was hoping someone could help.
# Attempt from the question: a histogram of Pre_V_HzR whose bar heights
# should be the summed Policy_Count per bucket.
# attach() puts Pre_vitality_HZ_Data on the search path so that the bare
# names used below (presumably its columns -- TODO confirm) can be resolved.
attach(Pre_vitality_HZ_Data)
# Bucket edges from min(Pre_V_HzR) to max(Pre_V_HzR) + 1 in steps of 0.05.
buckets_pre = seq(min(Pre_V_HzR),max(Pre_V_HzR)+1,0.05)
# NOTE(review): `bins` below is computed from `buckets`, but the vector
# created above is named `buckets_pre` -- confirm which name was intended.
# NOTE(review): sum(Policy_Count) inside aes() is a single total over the
# whole vector, not a per-bucket sum.
ggplot() +
geom_histogram(alpha = 0.2, aes(x=Pre_V_HzR, y = sum(Policy_Count)), bins = length(buckets), fill = 'aquamarine3')
`

To make the plot you want with ggplot2, it's necessary to prepare the data before plotting. In the solution below, I propose dividing the continuous x-variable into a discrete variable with cut(), and using aggregate() to sum the y-values for each bin of x-values. Besides the base R function aggregate, there are many ways to summarize, aggregate and reshape your data. You may wish to look into the dplyr package or data.table package (two very powerful, well supported packages).
library(ggplot2)
# Use the built-in data set `mtcars` to make the example reproducible.
# Run ?mtcars to see a description of the data set.
head(mtcars)
# mpg cyl disp hp drat wt qsec vs am gear carb
# Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
# Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
# Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
# Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
# Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
# Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Let's use `disp` (engine displacement) as the x-variable
# and `mpg` (miles per gallon) as the y-variable.
# Turn the continuous `disp` values into a factor of 20 equal-width
# intervals (21 evenly spaced edges from 71 to 472) with `cut()`.
disp_bin_edges <- seq(from = 71, to = 472, length.out = 21)
mtcars[["disp_discrete"]] <- cut(mtcars[["disp"]], breaks = disp_bin_edges)
# Total `mpg` within each interval of `disp_discrete`; `aggregate()`
# returns the per-interval sums as a new data.frame.
dat <- aggregate(mpg ~ disp_discrete, data = mtcars, FUN = sum)
# Use `geom_col()` (the same as `geom_bar(stat = "identity")`) to plot the
# pre-computed y-values in `dat`.
p1 <- ggplot(dat, aes(x = disp_discrete, y = mpg)) +
  geom_col() +
  # Keep bins without any cars on the axis instead of dropping them.
  scale_x_discrete(drop = FALSE) +
  # Rotate the interval labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90)) +
  ylab("Sum of miles per gallon") +
  xlab("Displacement, binned")
# For this example data, a scatterplot conveys a clearer story.
p2 <- ggplot(mtcars, aes(x = disp, y = mpg)) +
  geom_point(size = 5, alpha = 0.4) +
  ylab("Miles per gallon") +
  xlab("Displacement")
# Place both plots side by side and write them to a single PNG file.
library(gridExtra)
ggsave(
  "plots.png",
  arrangeGrob(p1, p2, nrow = 1),
  height = 4, width = 8, dpi = 150
)

Related

How to use a short script to eliminate all but one duplicate column variables based on the prefix of the colname

I want to know how to use a short script to eliminate all but one of a set of duplicate column variables, based on the prefix of the column name, without typing in by hand the variables I want to remove.
For example, I created repeats of the mtcars$am variables, called am1, am2, am3, and am4 in a data frame called mtcars_example_2. I removed the original am variable in the mtcars_example_2 data frame.
I can use the code below to drop every variable with the prefix "am" except am1 and store the result in a new data frame called mtcars_example_3, but it requires typing in by hand every variable to remove:
## long way of removing all variable with am prefix that were not am1
mtcars_example_3 <-
mtcars_example_2 %>%
select(
-c(
"am2", "am3", "am4"
)
)
But this seems like the long way of doing this. Is there a faster way that does not require me to individually type in the names of each of the variables that I want to remove from the data?
Is this possible? If so, how can this be done?
Thanks ahead of time.
Here is the code for the example:
# example data
## loads packages
library(tidyverse)
## creates mtcars_example data
mtcars_example_1 <- data.frame(mtcars)
mtcars_example_2 <- data.frame(mtcars_example_1)
## creates duplicate variables, based on am variable
mtcars_example_2$am1 <- mtcars_example_1$am
mtcars_example_2$am2 <- mtcars_example_1$am
mtcars_example_2$am3 <- mtcars_example_1$am
mtcars_example_2$am4 <- mtcars_example_1$am
## removes original variable
mtcars_example_2 <-
mtcars_example_2 %>%
select(
-c(
"am"
)
)
## long way of removing all variable with am prefix that were not am1
mtcars_example_3 <-
mtcars_example_2 %>%
select(
-c(
"am2", "am3", "am4"
)
)
You can remove all the variables that start with am but keep am1 :
library(dplyr)
mtcars_example_2 %>% select(-starts_with('am'), am1) %>% head
# mpg cyl disp hp drat wt qsec vs gear carb am1
#Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 4 4 1
#Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 4 4 1
#Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 4 1 1
#Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 3 1 0
#Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 3 2 0
#Valiant 18.1 6 225 105 2.76 3.460 20.22 1 3 1 0
Depending on your actual scenario you can also use regex to remove columns.
# The pattern is anchored with ^ and $ so that only columns named exactly
# am2, am3 or am4 are dropped; an unanchored 'am[2-4]' would also drop any
# column whose name merely contains that substring.
mtcars_example_2 %>% select(-matches('^am[2-4]$')) %>% head
We could also do
library(dplyr)
mtcars_example_2 %>%
select(-contains('am'), am1)

Multiple plots in R on a pdf with a loop

I think I need help with the loop. How would you do multiple plots on separate pdf pages with the data below:
pdf page 1:
Mazda RX4
2 panel plot for mpg vs cyl and mpg vs vs
pdf page 2:
Hornet 4 D
2 panel plot for mpg vs cyl and mpg vs vs
and the same for Valiant.
model mpg cyl vs
Mazda RX4 21.0 6 0
Mazda RX4 21.0 6 0
Mazda RX4 22.8 4 1
Hornet 4 D 21.4 6 1
Hornet 4 D 18.7 8 0
Valiant 18.1 6 1
Valiant 21.4 6 1
Valiant 21.0 6 0
Valiant 22.8 6 0
Thanks.
What I do in this case is set up the plots I want on one page with gridExtra, save that as a PDF, and then concatenate all these PDFs with ghostscript.
In R:
library(gridExtra)
library(ggplot2)
plot_one <- ggplot() + geom_...
plot_two <- ggplot() + geom_...
# Arrange the two plots one per row.
# arrangeGrob() only builds the layout object (grid.arrange() would also draw
# it on the current device); arranged plots can be nested, too!
two_rows <- arrangeGrob(plot_one, plot_two, nrow = 2)
ggsave("dataset_1.pdf", two_rows)
# repeat for second, third, etc datasets so you end up with dataset_2.pdf etc
These are then concatenated to one PDF with multiple pages with ghostscript:
gs -sDEVICE=pdfwrite \
-dNOPAUSE \
-dQUIET \
-dBATCH \
-sOutputFile=multipage.pdf \
dataset_1.pdf dataset_2.pdf
Derived from an example elsewhere (https://www.researchgate.net/post/How_to_save_the_graphics_in_several_separate_pages_with_R)
# Create pdf
pdf(...)
# Create different plots
plot1(...)
plot2(...)
plot3(...)
dev.off()
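Note: pdf() defaults to onefile = TRUE, which puts every plot on its own page of a single PDF. Set the parameter onefile=FALSE in pdf() (together with a file name pattern such as "plot%03d.pdf") only if you want a separate file for each plot.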

Predicting data via regression model and storing in a vector

Apologies for what is probably a very basic question.
I have created a linear model for a massive meteorological dataset using multiple regression. My goal is to use that model to "predict" data during a certain period using predictors 1, 2 and 3. I will then compare those predicted data to the observed data for that period.
My approach thus far has been to create a new vector for the predicted values and loop through the vector, creating predicted values based on the extracted coefficients of the linear model. Then, I will simply subtract the predicted values from the observed values. For some reason, this approach results in the new predicted vector being NULL. Any idea how I could approach this?
A sample is below. "data" refers to the dataset containing the predictors.
# Attempt from the question: rebuild the model's predictions by hand from its
# coefficients, then subtract them from the observed values.
# Coefficients of the fitted model, stored in a standalone object `coef`.
coef <- coefficients(multipleRegressionModel)
# Start from an empty vector that is grown inside the loop.
predictedValues=c()
# One iteration per element of data$timePeriod.
for(i in 1:length(data$timePeriod)){
# NOTE(review): the coefficients are looked up as `data$coef[...]`, i.e. as a
# column of `data`, while they were stored above in the separate object
# `coef`; if `data` has no `coef` column this lookup yields NULL -- confirm.
# NOTE(review): the result of append() is assigned into the single element
# predictedValues[i]; confirm whether plain indexing or append() was meant.
predictedValues[i] = append(predictedValues, data$coef[1]+data$predictor1[i]*data$coef[2]+data$predictor2[i]*data$coef[3]+
data$predictor3[i]*data$coef[4])
}
# Difference between observed and predicted values; the empty initialisation
# is immediately overwritten by the next line.
diff=c()
diff=observedValues - predictedValues
It looks like you are making this more difficult than it needs to be. R has a predict() function that does all of this for you. If you had a sample data.frame like so:
set.seed(26)
mydf = data.frame (a=1:20 , b = rnorm(20),
c = 1:20 + runif(20,2,3)*runif(20, 2, 3),
d = 1:20 + rpois(20,5)*runif(1:20)*sin(1:20))
And you wanted to train on some rows, and test on the others
trainRows<-sample(1:20, 16)
mydf.train<-mydf[trainRows,]
mydf.test<-mydf[-trainRows,]
Then fit the model and predict
# Fit a linear model of `a` on `b`, `c` and `d` using only the training rows.
model <- lm(a ~ b + c + d, data = mydf.train)
summary(model) # gives info about your model.
# Predict the held-out rows with the model fitted above (the object is
# named `model`) and store the predictions next to the observed values.
mydf.test$pred <- predict(model, newdata = mydf.test)
# Mean squared error between predicted and observed `a` on the test rows.
MSE <- mean((mydf.test$pred - mydf.test$a)^2)
MSE
#[1] 0.06321
View the predictions with mydf.test$pred
Here is a simple example using a glm on the mtcars data.
# Fit a GLM on `train_dat` and return its predictions for `test_dat`.
#
# Arguments:
#   train_dat  data.frame used to fit the model.
#   test_dat   data.frame passed to predict() as `newdata`.
#   variables  character vector of predictor column names.
#   y_var      name of the response column.
#   family     passed on to glm(); defaults to "gaussian".
#
# Returns the result of predict() on the fitted model. predict() is called
# with its default `type`, so no `type = "response"` transformation is
# requested here.
Line <- function(train_dat, test_dat, variables, y_var, family = "gaussian") {
  # Build the formula "<y_var> ~ v1+v2+..." from the column names.
  fm <- as.formula(paste(y_var, " ~", paste(variables, collapse = "+")))
  glm1 <- glm(fm, data = train_dat, family = family)
  # Return the predictions as the (visible) value of the function rather
  # than as the invisible value of a trailing assignment.
  predict(glm1, newdata = test_dat)
}
data(mtcars)
y_var <- "mpg"
# Every column except the response is used as a predictor.
x_vars <- setdiff(names(mtcars), y_var)
# Train and predict on the same data, storing the result as a new column.
mtcars[, "linear_prediction"] <- Line(mtcars, mtcars, x_vars, y_var)
head(mtcars)
mpg cyl disp hp drat wt qsec vs am gear carb linear_prediction
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 22.59951
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 22.11189
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 26.25064
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 21.23740
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 17.69343
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 20.38304

geom_point (or qplot), color the points with a third variable-discrete, without converting to factors

My story:
I need to plot a 2-D plot with the points colored by a third variable, which is discrete and takes integer values (20 possible values).
Finding so far:
All the code that I find first converts the third variable to factors and then color the points with the factor levels. For instance,
p <- qplot(mpg, wt, data = mtcars, colour = factor(cyl))
p + scale_colour_manual(values = c("red","blue", "green"))
Question:
I am confused from here because I am not sure what the actual value (of my original third variable) is corresponding to each level of the factors. Are the values in the legend the actual values or the levels?
Is there another way to do it without converting my variable to a factor variable?
If you look at the output from mtcars, you see that the values of the cyl variable (before converting to a factor) are 4, 6, and 8.
> mtcars
mpg cyl disp hp drat wt
Mazda RX4 21.0 6 160.0 110 3.90 2.620
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875
Datsun 710 22.8 4 108.0 93 3.85 2.320
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440
...
When you convert mtcars$cyl variable to a factor it uses the original values as the labels:
> mtcars$cyl <- as.factor(mtcars$cyl)
> str(mtcars$cyl)
Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
So the graph example you created above is outputting the new factor labels and these correspond directly to your original values. That is to say, it should be safe to convert your discrete variable with 20 levels to a factor and use the new factor variable to colour your graph, the labels will be correct.
If you don't want to change your 20-level discrete variable you can always graph it as a continuous variable but I don't think the resulting legend is the type of legend you're after.
qplot(mpg, wt, data = mtcars, colour = cyl)

Subsets of a dataset as separate dendrograms, but in the same plot

I know I can plot a dendrogram as follows
library(cluster)
d <- mtcars
d[,8:11] <- lapply(d[,8:11], as.factor)
gdist <- daisy(d, metric = c("gower"), stand = FALSE)
dendro <- hclust(gdist, method = "average")
plot(as.dendrogram(dendro))
However I have some groups identified (eg. by an iterative classification method), given as the last column in d
G <- c(1,2,3,3,4,4,5,5,5,5,1,2,1,1,2,4,1,3,4,5,1,7,4,3,3,2,1,1,1,3,5,6)
d$Group <- G
head(d)
mpg cyl disp hp drat wt qsec vs am gear carb Group
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 1
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 2
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 3
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 3
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 4
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 4
I am trying to plot all the dendrograms together on the same plot with the same scale. The groups with only a single member also needs to be plotted. (group 6 and 7)
I am able to plot individual dendrograms for subset of the data except when number of members in a group is only one. But I don't think this is the right approach.
# Attempt from the question: one dendrogram per group on a 3 x 3 grid.
# Split the device into nine panels that are filled row by row.
layout(matrix(1:9, 3,3,byrow=TRUE))
# Convert the dissimilarities to a matrix so they can be subset by row name.
gdist <- as.matrix(gdist)
N <- max(G)
for (i in 1:N){
# Row names of the observations that belong to group i.
rc_tokeep <- row.names(subset(d, G==i))
# Dissimilarities restricted to the members of this group.
dis <- as.dist(gdist[rc_tokeep, rc_tokeep])
# NOTE(review): no guard on group size here; the question reports that this
# call stops with "must have n >= 2 objects to cluster" for groups 6 and 7.
dend <- hclust(dis, method = "average")
plot(as.dendrogram(dend))
}
The loop gives this error for the last two groups (6 and 7), which have only a single member each.
Error in hclust(dis, method = "average") :
must have n >= 2 objects to cluster
Essentially, I want to reproduce this type of plot. The clusters with single members are also plotted here.
If you want to mimic the last few graphs, you can do something like this:
# Draw one horizontal dendrogram per group, stacked in a single column and
# sharing the same x-range. Uses `G`, `d` and `gdist` from the question.
N <- max(G)
# Panels 1..N hold the groups; the leading and trailing 0 cells stay empty.
layout(matrix(c(0, seq_len(N), 0), ncol = 1))
# Matrix form allows subsetting the dissimilarities by row name.
gdist <- as.matrix(gdist)
for (i in seq_len(N)) {
  par(mar = c(0, 3, 0, 7))
  rc_tokeep <- row.names(subset(d, G == i))
  if (length(rc_tokeep) >= 2) {
    # hclust() needs at least two objects, so every group with two or more
    # members gets a real dendrogram.
    dis <- as.dist(gdist[rc_tokeep, rc_tokeep])
    dend <- hclust(dis, method = "average")
    # Giving the same xlim to every panel puts them all on one scale; 0.8
    # fits this data but can be changed to whatever is needed.
    plot(as.dendrogram(dend), horiz = TRUE, xlim = c(.8, 0), axes = FALSE)
  } else {
    # Groups with a single element cannot be clustered: draw them by hand.
    plot(NA, xlim = c(.8, 0), ylim = c(0, 1), axes = FALSE, ann = FALSE)
    # The branch length for a one-element group is arbitrary here (0.1);
    # change it to whatever length is meaningful for your data.
    segments(0, .5, .1, .5)
    text(0, .5, labels = rc_tokeep, pos = 4, xpd = TRUE)
  }
}
With your example it gives:
If you want to add the scale you can add a grid in all graphs and a scale in the last one:
# Same stacked dendrograms as before, plus a dotted vertical grid in every
# panel and an x-axis under the last one. Uses `G`, `d` and `gdist` from the
# question.
N <- max(G)
# Panels 1..N hold the groups; the leading and trailing 0 cells stay empty.
layout(matrix(c(0, seq_len(N), 0), ncol = 1))
gdist <- as.matrix(gdist)
for (i in seq_len(N)) {
  par(mar = c(0, 3, 0, 7))
  rc_tokeep <- row.names(subset(d, G == i))
  if (length(rc_tokeep) >= 2) {
    # hclust() needs at least two objects to cluster.
    dis <- as.dist(gdist[rc_tokeep, rc_tokeep])
    dend <- hclust(dis, method = "average")
    plot(as.dendrogram(dend), horiz = TRUE, xlim = c(.8, 0),
         xaxt = "n", yaxt = "n")
    abline(v = seq(0, .8, .1), lty = 3) # Here the grid
  } else {
    # Single-element group: draw a stub branch and its label by hand.
    plot(NA, xlim = c(.8, 0), ylim = c(0, 1), axes = FALSE, ann = FALSE)
    segments(0, .5, .1, .5)
    text(0, .5, labels = rc_tokeep, pos = 4, xpd = TRUE)
    abline(v = seq(0, .8, .1), lty = 3) # Here the grid
  }
}
# Called after the loop, so the axis is added to the last panel drawn.
axis(1, at = seq(0, .8, .1)) # Here the axis
And finally if you want to even the spaces between the different branches in the resulting plot, you can use table(d$Group) to get the number of members of each group and use it as a height for each subplot:
# Same plot again, but each panel's height is proportional to the number of
# members in its group so that leaves are evenly spaced across panels.
# Uses `G`, `d` and `gdist` from the question.
N <- max(G)
# Panel heights: 3 for each of the two empty margin cells, and the group
# sizes from table(d$Group) for panels 1..N.
layout(matrix(c(0, seq_len(N), 0), ncol = 1),
       heights = c(3, table(d$Group), 3))
gdist <- as.matrix(gdist)
for (i in seq_len(N)) {
  par(mar = c(0, 3, 0, 7))
  rc_tokeep <- row.names(subset(d, G == i))
  if (length(rc_tokeep) >= 2) {
    # hclust() needs at least two objects to cluster.
    dis <- as.dist(gdist[rc_tokeep, rc_tokeep])
    dend <- hclust(dis, method = "average")
    plot(as.dendrogram(dend), horiz = TRUE, xlim = c(.8, 0),
         xaxt = "n", yaxt = "n")
    abline(v = seq(0, .8, .1), lty = 3)
  } else {
    # Single-element group: draw a stub branch and its label by hand.
    plot(NA, xlim = c(.8, 0), ylim = c(0, 1), axes = FALSE, ann = FALSE)
    segments(0, .5, .1, .5)
    text(0, .5, labels = rc_tokeep, pos = 4, xpd = TRUE)
    abline(v = seq(0, .8, .1), lty = 3)
  }
}
# Called after the loop, so the axis is added to the last panel drawn.
axis(1, at = seq(0, .8, .1))

Resources