How to clean my data and create a graph with ggplot2? - r

So I want to create a graph using data from Wikipedia, I created a data frame out of table that I have found. It contains two columns - style of beer and range of bitternes (IBU) like "20-50". Both are character, so I can't make a graph out of it that makes sense. I managed to change IBU column to two separate ones that are both numeric (min and max) but it created second data frame inside my first data frame, tried to find similar case but I couldn't, I'm now stuck and don't know what to do next :(
Sorry in advance for pasting so much code, I just want someone to read the data and see it's structure.
library(xml2)
library(rvest)
library(ggplot2)
library(tidyr)
file_html <- read_html(
"https://pl.wikipedia.org/wiki/International_Bittering_Units",
encoding = "UTF-8")
table_html <- html_node(file_html, "#mw-content-text > div > table")
table_IBU <- html_table(table_html, fill = TRUE)
table_IBU$IBU2 <- str_replace(table_IBU$`Stopień IBU`, "\\+", "")
table_IBU$IBU3 <- tidyr::separate(table_IBU, IBU2, into = c("min", "max"), sep = " – ")
table_IBU <- subset(table_IBU, select = -c(IBU2,
`Stopień IBU`,
`Gatunek piwa`))
table_IBU$IBU3$min2 <- as.numeric(table_IBU$IBU3$min)
table_IBU$IBU3$max2 <- as.numeric(table_IBU$IBU3$max)
#graph that I can come up with on my own
IBUgraph <- ggplot(table_IBU$IBU3, aes(reorder(`Gatunek piwa`, + max2),
max2)) +
geom_point(width = 0.5, color = "darkolivegreen",
fill = "darkseagreen4") +
theme(text=element_text(size = 9))
IBUgraph = IBUgraph +
labs(y = "Międzynarodowe Jednostki Goryczy (IBU)",
x = "Gatunek",
title = "Skala IBU - International Bitterness Units,
czyli międzynarodowe jednostki goryczy")
IBUgraph <- IBUgraph + theme(axis.text.x=element_text(angle=45, hjust=1.1))
IBUgraph
In the end I want to create a graph using ggplot() showcasing style of beer on x axis, and two points for each style showcasing minimum vaule, maximum value.

You can do this for example, it's called a dumbbell chart
ggplot(table_IBU$IBU3,aes(x=`Gatunek piwa`)) +
geom_point(aes(y=min2)) + # add point for min
geom_point(aes(y=max2)) + # add point for max
geom_segment(aes(xend=`Gatunek piwa`,y=min2,yend=max2)) + # create segment between min and max
theme(axis.text.x = element_text(angle = 90, hjust = 1)) # rotate x axis

So, are you looking for something like this?
library(dplyr)
library(stringr)
library(tidyr)
library(ggplot2)
library(rvest)
#Acquire table
table_IBU <- read_html("https://pl.wikipedia.org/wiki/International_Bittering_Units", encoding = "UTF-8") %>%
html_node(., "#mw-content-text > div > table") %>%
html_table(., fill = TRUE)
#Extract scores into min and max values
table_IBU$IBU2 <- str_replace(table_IBU$`Stopień IBU`, "\\+", "")
table_IBU %<>% separate(., IBU2, into = c("min", "max"), sep = " – ") %>% select(-c(`Stopień IBU`))
table_IBU$min <- as.integer(table_IBU$min)
table_IBU$max <- as.integer(table_IBU$max)
table_IBU %<>% gather(data = ., key = "Limit", value = "Value", min, max)
#Plot
table_IBU %>% ggplot(data = ., aes(x = `Gatunek piwa`)) +
geom_point(aes(y = Value, col = Limit)) +
xlab("Type of beer") +
ylab("Score (0-120)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Quite an odd way to display this data.

Related

How can I create multiple plots from same dataset in R?

Let me first share a dummy data, from which I want to prepare ggplot graphs.
library(tidyverse)
set.seed(1)
sample_size <- 1200
dates <- sample(seq(1,31),sample_size,replace = TRUE)
Monthss <- sample(seq(1,12),sample_size,replace = TRUE)
hrs <- sample(seq(1,23),sample_size,replace = TRUE)
minutes <- sample(seq(1,59),sample_size,replace = TRUE)
date_time_vector <- paste0(dates,"-",Monthss,"-",2022," ",hrs,":",minutes) |> lubridate::parse_date_time("dmy HM")
Conversion <- sample(c(TRUE,FALSE),sample_size, prob = c(0.25,0.75), replace = TRUE)
df <- data.frame(Date = date_time_vector, Conversion_Status = Conversion)
df <- df |> mutate(Leads = round(runif(sample_size, min = 0,max = 10),digits = 0))
df <- df[complete.cases(df), ]
The code above gives me a data.frame with columns Date, Leads and Conversion_Status. I want to prepare Monthly column chart of total leads per day. (For example, daily leads in January, daily leads in February, etc.) So, basically, I will need to split the data on the basis of Month, and prepare one chart for each month. How can I prepare such charts?
I have tried following way:
bar_function <- function(df, col1, col2, title) {
df %>%
ggplot2::ggplot(aes(x = {{col1}}, y = {{col2}})) +
ggplot2::geom_col(fill = "steelblue") +
theme(plot.background = element_rect(fill = "white")) +theme(plot.title = element_text(hjust = 0.5))+coord_flip() +
ggplot2::labs(title = title)
}
mycharts <- df |> dplyr::nest_by(Month) |> dplyr::mutate(plot = bar_function(df,Date,Leads,"Daily Leads by Month"))
But it is giving me errors.
You can split according to month(year) and plot that.
library(ggplot2)
library(lubridate)
set.seed(1)
sample_size <- 1200
dates <- sample(seq(1,31),sample_size,replace = TRUE)
Monthss <- sample(seq(1,12),sample_size,replace = TRUE)
hrs <- sample(seq(1,23),sample_size,replace = TRUE)
minutes <- sample(seq(1,59),sample_size,replace = TRUE)
date_time_vector <- paste0(dates,"-",Monthss,"-",2022," ",hrs,":",minutes) |> lubridate::parse_date_time("dmy HM")
Conversion <- sample(c(TRUE,FALSE),sample_size, prob = c(0.25,0.75), replace = TRUE)
df <- data.frame(Date = date_time_vector, Conversion_Status = Conversion)
df$Leads <- round(runif(sample_size, min = 0,max = 10),digits = 0)
df <- df[complete.cases(df), ]
df$month_year <- strftime(df$Date, format = "%m-%Y")
df.split <- split(df, f = df$month_year)
out <- vector("list", length(df.split))
names(out) <- names(df.split)
for (i in seq_along(df.split)) {
out[[i]] <- ggplot(data = df.split[[i]], mapping = aes(x = Date, y = Leads)) +
geom_col(fill = "steelblue") +
theme(plot.background = element_rect(fill = "white")) +
theme(plot.title = element_text(hjust = 0.5))+
coord_flip() +
labs(title = "Daily leads by month")
}
To plot you can just print e.g. out[[1]].
If you want to change the desired columns dynamically, you can use aes_string for mapping. This can naturally be wrapped into sapply and there are probably other ways of approaching the problem. The for loop is pretty agnostic and I find that it's readable even by people who do not dabble in R (compared to say sapply).
There are some issues with your code. First, your dataset has no Month column, i.e. you have to add it for which I use lubridate::month. Second, you are passing the dataset df to your bar function instead of the splitted data column from your nested df. Third, in the mutate step you have to wrap the result in list():
library(ggplot2)
library(dplyr, warn=FALSE)
mycharts <- df |>
nest_by(Month = lubridate::month(Date)) |>
mutate(plot = list(bar_function(data, Date, Leads, "Daily Leads by Month")))
mycharts$plot[[1]]
mycharts$plot[[5]]
I finally found an answer. I used following code:
lapply(split(df, df$Month),
function(x)
ggplot(x, aes(x=Date, y=Leads)) +
geom_col(fill = "steelblue") + coord_flip()+
ggtitle(x$Month[1]))
Thank you all for your support.

Specification curve "choices" plot using ggplot2

I have a small dataset of estimates from many regressions of an outcome variable on a main treatment variable and then various sets of control variables (in fact, all possible combinations of those controls variables). The table of estimates is as follows:
df <-
structure(list(control_set = c("cen21_hindu_pct", "cen83_urban_pct",
"cen21_hindu_pct + cen83_urban_pct", "NONE"), xest = c(0.0124513609978549,
0.00427174623249021, 0.006447506098051, 0.0137107176362076),
xest_conf_low = c(0.00750677700140716, -0.00436301983024899,
-0.0013089334064237, 0.00925185534519074), xest_conf_high = c(0.0173959449943027,
0.0129065122952294, 0.0142039456025257, 0.0181695799272245
)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))
I want to make the two plots for the classic "specification curve analysis." The top plot is simply the set of estimates ordered by the magnitude of the estimate on the main treatment variable (no issue here):
df %>%
arrange(xest) %>%
mutate(specifications = 1:nrow(.)) %>%
ggplot(aes(x = specifications, y = xest, ymin = xest_conf_low, ymax = xest_conf_high)) +
geom_pointrange(alpha = 0.1, size = 0.6, fatten = 1) +
labs(x = "", y = "Estimate\n") +
theme_bw()
My problem is with the aligned plot underneath that describes the control-set choices. Directly underneath each coefficient dot and whisker from the plot just made I want a plot that indicates the set of corresponding control variables that were included in that model (i.e. the list of controls in the control_set column in the df data frame row). So the plot I need in this example would look just like this:
This is a (failed) sketch of what I tried to get there, by modifying the earlier estimation dataset in long form, but I couldn't get multiple ticks to show vertically: (Note, this bit of code won't run)
# forplot %>%
# arrange(xest) %>%
# mutate(specifications = 1:nrow(.)) %>%
# mutate(value = "|") %>%
# ggplot(aes(specifications, term)) +
# geom_text(aes(label = value)) +
# scale_color_manual(values = c("lightblue")) +
# labs(x = "\nSpecification number", y = "") +
# theme_bw()
How can I use ggplot2 to make the plot-figure shown above from the information in the data frame, df?
If we define your plot as -> a...
library(patchwork)
b <- tibble(specifications = c(1,2,2,3),
control_set = rep(c("cen83_urban_pct", "cen21_hindu_pct"), each = 2)) %>%
ggplot(aes(specifications, control_set)) +
geom_text(aes(label = "|"), size = 5) +
coord_cartesian(xlim = c(1,4)) +
labs(x = NULL, y = NULL) +
theme_bw()+
theme(axis.ticks = element_blank(),
axis.text.x = element_blank())
a/b + plot_layout(heights = c(3,1))
If you want to generate the key automatically, you might use something like this:
library(dplyr)
df %>%
select(control_set) %>%
mutate(specifications = 1:4) %>%
separate_rows(control_set, sep = "\\+") %>%
mutate(control_set = trimws(control_set)) %>% # b/c my regex not good enough to trim spaces in line above
...
If you want to relabel the numbers in the y-axis with the control_set labels you can add
+ scale_y_continuous(breaks = df$xest, labels = df$control_set)

How to plot a(n unknown) number of data series as geom_line in same chart

My first Q here, so please go lightly if I'm out of step anywhere.
I'm trying to code R to produce a single chart to contain a number of data series lines. The number of data series may vary but will be provided in the data frame. I have tried to rearrange another thread's content to print the geom_line , but not successfully.
The logic is:
#desire to replace loop of 1:5 with ncol(df)
print(ggplot(df,aes(x=time))
for (i in 1:5) {
print (+ geom_line(aes(y=df[,i]))
}
#functioning geom point loops ggplot production:
for (i in 1:5) {
print(ggplot(df,aes(x=time,y=df[,i]))+geom_point())
}
#functioning multi-line ggplot where n is explicit:
ggplot(data=df, aes(x=time), group=1) +
geom_line(aes(y=df$`3`))+
geom_line(aes(y=df$`4`))
The functioning example code produces n number of point charts, 5 in this case. I would like just one chart to contain n line series.
This may be similar to How to plot n dimensional matrix? for which there are currently no relevant answers
Any contributions much appreciated, thanks
You can use gather from tidyverse "world" to do that.
As you didn't supply a sample data I used mtcars.
I created two data.frames one with 3 columns one with 9. In each one of them I plotted all of the variables against the variable mpg.
library(tidyverse)
df3Columns <- mtcars[, 1:4]
df9Columns <- mtcars[, 1:10]
df3Columns %>%
gather(var, value, -mpg) %>%
ggplot(aes(mpg, value, group = var, color = var)) +
geom_line()
df9Columns %>%
gather(var, value, -mpg) %>%
ggplot(aes(mpg, value, group = var, color = var)) +
geom_line()
Edit - using the sample data in comments.
library(tidyverse)
df %>%
rownames_to_column("time") %>%
gather(var, value, -time) %>%
ggplot(aes(time, value, group = var, color = var)) +
geom_line()
Sample data:
df <- structure(list("39083" = c(96, 100, 100), "39090" = c(99, 100, 100), "39097" = c(99, 100, 100)), row.names = 3:5, class = "data.frame")
To strictly answer your question, you can simply store your ggplot in a variable and add the geom_line one by one:
df <- structure(list("39083" = c(96, 100, 100), "39090" = c(99, 100, 100), "39097" = c(99, 100, 100)), row.names = 3:5, class = "data.frame")
g <- ggplot(df, aes(x = 1:nrow(df)))
for (i in colnames(df))
{
g <- g + geom_line(y = df[,i])
}
g <- g + scale_y_continuous(limits = c(min(df), max(df)))
print(g)
However, this is not a very convenient solution. I would highly recommend to refactor your data frame to be more ggplot style.
df.ultimate <- data.frame(time = numeric(), value = numeric(), group = character())
for (i in colnames(df))
{
df.ultimate <- rbind(df.ultimate, data.frame(time = 1:nrow(df), value = df[, i], group = i))
}
g <- ggplot(df.ultimate, aes(x = time, y = value, color = group))
g <- g + geom_line()
print(g)
A one-line solution:
ggplot(data.frame(time = rep(1:nrow(df), ncol(df)),
value = as.vector(as.matrix(df)),
group = rep(colnames(df), each = nrow(df))),
aes(x = time, y = value, color = group)) + geom_line()

How can I loop colnames as plot titles along with data using lapply in R?

I have this function that works close to what I need -- it creates a clean table from my original raw data, makes it a ggplot, and uses lapply to run it through all the variables I want from the original table, data:
#Get colnames of all numeric varaibles
nlist <- names(data[,sapply(data,is.numeric)])
#Create function
varviz_n <- function(dat, var){
var <- dat[,which(names(dat) == var)]
title<-var
tab <- dat %>%
group_by(group = cut(var, breaks = seq(0, max(var), 10)),
groupedsupport) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n)) %>%
filter(!is.na(group),n>10)
tab2 <- tab %>%
group_by(groupedsupport) %>%
summarise(mean = mean(freq),
median = median(freq))
finaltab <- tab %>% left_join(tab2, by = "groupedsupport")
fplot <- finaltab %>%
ggplot(aes(fill=group,x=groupedsupport,y=freq)) +
geom_col(position="dodge") +
geom_text(aes(label = paste("n =",n), n = (n + 0.05)), position = position_dodge(0.9), vjust = 0, size=2) +
geom_errorbar(aes(groupedsupport, ymax = median, ymin = mean),
size=0.5, linetype = "longdash", inherit.aes = F, width = 1) +
scale_y_continuous(labels = scales::percent) +
xlab("") + ylab("") +
ggtitle(title) +
scale_fill_discrete("")
filename = filename <- paste0(finaltab$var)
ggsave(paste("Plots/",filename,".png"), width = 10, height = 7)
return(fplot)
}
#Run function
lapply(nlist, varviz_n, dat = data)
This does almost exactly what I want -- the problem is that all of the variables it's running through are 0-100 numeric and it's creating the plots but I can't at all figure out how to get the column name as the title of the plot or of the key. So I have no idea which graph is getting returned.
Can someone please help me figure out a way to get the column name from nlist to be the title of my plot? The way it is now prints out the first value of the column instead of the actual column name:
The final piece of code to save it in the 'Plots' folder doesn't work either since the title/var isn't populating correctly.
You can use something like this to create data to test out the code: data <- data.frame(v1 = sample(1:100,1000,replace=T),v2 = sample(1:100,1000,replace=T),v3 = sample(1:100,1000,replace=T),groupedsupport = sample(LETTERS[1:3],1000,replace = TRUE))
Thanks!
I think you just need to swap these steps:
var <- dat[,which(names(dat) == var)]
title <- var
should be
title <- var
var <- dat[,which(names(dat) == var)]
var being assigned to the column of selected data so when it is called again in title, it is looking at that vector and not the column name.
If this doesn't resole it, please give us some code to mimic the contents of data.

Gap between forecast and actual data in ggplot

I am trying to plot some data, fitted values and forecasts on a nice ggplot format but when I plot my data the way I think should work I get a gap between the real data and the forecast. The gap is meaningless but it would be nice if it was gone.
Some R code you can use to recreate my problem is:
library(xts)
library(tidyverse)
library(forecast)
dates <- seq(as.Date("2016-01-01"), length = 100, by = "days")
realdata <- arima.sim(model = list(ar = 0.7, order = c(1,1,0)), n = 99)
data <- xts(realdata, order.by = dates)
user_arima <- arima(data, order = c(1,1,0))
user_arimaf <- forecast(user_arima)
fits <- xts(user_arimaf$fitted, order.by = dates)
fcastdates <- as.Date(dates[100]) + 1:10
meancast <- xts(user_arimaf$mean[1:10], order.by = fcastdates)
lowercast95 <- xts(user_arimaf$lower[1:10], order.by = fcastdates)
uppercast95 <- xts(user_arimaf$upper[1:10], order.by = fcastdates)
frame <- merge(data, fits, meancast, uppercast95, lowercast95, all = TRUE, fill = NA)
frame <- as.data.frame(frame) %>%
mutate(date = as.Date(dates[1] + 0:(109)))
frame %>%
ggplot() +
geom_line(aes(date, data, color = "Data")) +
geom_line(aes(date, fits, color = "Fitted")) +
geom_line(aes(date, meancast, color = "Forecast")) +
geom_ribbon(aes(date, ymin=lowercast95,ymax=uppercast95),alpha=.25) +
scale_color_manual(values = c(
'Data' = 'black',
'Fitted' = 'red',
'Forecast' = 'darkblue')) +
labs(color = 'Legend') +
theme_classic() +
ylab("some data") +
xlab("Date") +
labs(title = "chart showing a gap",
subtitle = "Shaded area is the 95% CI from the ARIMA")
And the chart is below
I know there is a geom_forecast in ggplot now but I would like to build this particular plot the way i'm doing it. Although if there's no other solution to the gap then i'll use the geom_forecast.
Closing the gap requires providing a data point in the meancast column for the blank area. I guess it makes sense just to use the value for the last "real" data point.
# Grab the y-value corresponding to the date just before the gap.
last_data_value = frame[frame$date == as.Date("2016-04-09"), "data"]
# Construct a one-row data.frame.
extra_row = data.frame(data=NA_real_,
fits=NA_real_,
meancast=last_data_value,
uppercast95=last_data_value,
lowercast95=last_data_value,
date=as.Date("2016-04-09"))
# Add extra row to the main data.frame.
frame = rbind(frame, extra_row)

Resources