Assign name of data frame to ggplot title in purrr loop - r

I have two data sets from which I would like to generate histograms showing how the data overlap by name (A, B, C). I have written a custom function so I can use ggplot with map2.
I would like the graphs to be titled according to the name of each data set, so "A", "B", "C." Does anyone know of a way to do this?
# load packages
library(ggplot2)
library(dplyr)
library(purrr)
## load and format data 1
df1_raw <- data.frame(name = c("A", "B", "C", "A", "C", "B"),
start = c(1, 3, 4, 5, 2, 1),
end = c(6, 5, 7, 8, 6, 7))
df1 <- split(x = df1_raw, f = df1_raw$name) # split data by name
df1 <- lapply(df1, function(x) Map(seq.int, x$start, x$end)) # generate sequence intervals
df1 <- map(df1, unlist) # unlist sequences
df1 <- lapply(df1, data.frame) # convert to df
## load and format data 2
df2_raw <- data.frame(name = c("C", "B", "C", "A", "A", "B"),
start = c(5, 4, 3, 4, 4, 5),
end = c(7, 8, 7, 6, 9, 6))
df2 <- split(x = df2_raw, f = df2_raw$name) # split data by name
df2 <- lapply(df2, function(x) Map(seq.int, x$start, x$end)) # generate sequence intervals
df2 <- map(df2, unlist) # unlist sequences
df2 <- lapply(df2, data.frame) # convert to df
## write custom ggplot function and generate graphs
gplot <- function(data1, data2) {
ggplot() +
geom_histogram(data = data1, aes(x = X..i..), binwidth = 1, color = "grey", fill = "grey") +
geom_histogram(data = data2, aes(x = X..i..), binwidth = 1, fill = "pink", alpha = 0.7) +
labs(
title = ls(data1))
}
hist <- map2(df1, df2, gplot)
I also tried the following in the title field in my function:
deparse(substitute(data1))

Another similar option to what #GregorThomas mentioned in the comments, you could add a name variable to your data.frames and pull from that in your gplot() function. I've also shown how you might combine a few of your data manipulation steps:
# load packages
library(ggplot2)
library(dplyr)
library(purrr)
## load and format data 1
df1_raw <- data.frame(name = c("A", "B", "C", "A", "C", "B"),
start = c(1, 3, 4, 5, 2, 1),
end = c(6, 5, 7, 8, 6, 7))
df1 <- df1_raw %>%
split(.$name) %>% # split data by name
imap(function(x, x_name) {
data.frame(value = Map(seq.int, x$start, x$end) %>% unlist,
name = x_name)
})
## load and format data 2
df2_raw <- data.frame(name = c("C", "B", "C", "A", "A", "B"),
start = c(5, 4, 3, 4, 4, 5),
end = c(7, 8, 7, 6, 9, 6))
df2 <- df2_raw %>%
split(.$name) %>% # split data by name
imap(function(x, x_name) {
data.frame(value = Map(seq.int, x$start, x$end) %>% unlist,
name = x_name)
})
## change the title component of your previous function
gplot <- function(data1, data2) {
ggplot() +
geom_histogram(data = data1, aes(x = value), binwidth = 1, color = "grey", fill = "grey") +
geom_histogram(data = data2, aes(x = value), binwidth = 1, fill = "pink", alpha = 0.7) +
ggtitle(data1$name[1])
}
## plot it
map2(df1, df2, gplot)

Related

Why is the standard deviation mismatched in barplot when I am using plotly and crosstalk

I want to produce barplot using plotly and crosstalk, but I found that in the plot, the standard deviations are mismatched.
I don't understand the reason.
Here is my demo data:
library(plotly)
library(crosstalk)
BPData <- data.frame(
Metabolite = rep(c("X", "Y"), each = 5),
Group = rep(c("A", "B", "C", "D", "E"), 2),
MEAN = c(7, 6, 7, 3, 7, 7, 7, 8, 7, 7),
SD = 1:10
)
> BPData
Metabolite Group MEAN SD
1 X A 7 1
2 X B 6 2
3 X C 7 3
4 X D 3 4
5 X E 7 5
6 Y A 7 6
7 Y B 7 7
8 Y C 8 8
9 Y D 7 9
10 Y E 7 10
Below is my minimal example:
BPData <- data.frame(
Metabolite = rep(c("X", "Y"), each = 5),
Group = rep(c("A", "B", "C", "D", "E"), 2),
MEAN = c(7, 6, 7, 3, 7, 7, 7, 8, 7, 7),
SD = 1:10
)
BPData <- crosstalk::SharedData$new(BPData)
metaboliteFilter <- crosstalk::filter_select(id = "Metabolite",
label = "Select a metabolite",
sharedData = BPData,
group = ~ Metabolite,
multiple = F
)
BPPlot <- plotly::plot_ly(data = BPData,
x = ~ Group,
y = ~ MEAN,
color = ~ Group,
type = 'bar',
error_y = ~list(array = SD, color = '#808080')
) %>%
layout(yaxis = list(categoryorder = "trace"))
filter <- crosstalk::bscols(
metaboliteFilter,
BPPlot
)
crosstalk::bscols(filter)
This is a very strange bug of plotly.
The error bars appear in the wrong order when grouping data by color.
So the solution is either to remove the color parameter or to re-order to data by the colored variable before making plot.
Here is my solution:
library(plotly)
library(crosstalk)
BPData <- data.frame(
Metabolite = rep(c("X", "Y"), each = 5),
Group = rep(c("A", "B", "C", "D", "E"), 2),
MEAN = c(7, 6, 7, 3, 7, 7, 7, 8, 7, 7),
SD = 1:10
)
# must order data by the colored variable here
BPData <- BPData[order(BPData$Group),]
BPData <- crosstalk::SharedData$new(BPData)
metaboliteFilter <- crosstalk::filter_select(id = "Metabolite",
label = "Select a metabolite",
sharedData = BPData,
group = ~ Metabolite,
multiple = F
)
BPPlot <- plotly::plot_ly(data = BPData,
x = ~ Group,
y = ~ MEAN,
color = ~ Group,
error_y = ~list(array = SD, color = '#808080'),
type = 'bar'
) %>%
layout(yaxis = list(categoryorder = "trace"))
filter <- crosstalk::bscols(
metaboliteFilter,
BPPlot
)
crosstalk::bscols(filter)

Plot TukeyHSD as Heatmap

Can I present the result of TukeyHSD as a heatmap? And how would the code look like concerning the example below?
#Daten erstellen
set.seed (0)
data <- data.frame(group = rep(c("A", "B", "C"), each = 30),
values = c(runif(30, 0, 3),
runif(30, 0, 5),
runif(30, 1, 7)))
#Die ersten sechs Zeilen anzeigen
head(data)
#einfaktorielles ANOVA-Modell anpassen
model <- aov(values~group, data=data)
#Sehen Sie sich die Modellausgabe an
summary(model)
#Tukey Test durchführen
TukeyHSD(model, conf.level=.95)
#Konfidenzintervalle plotten
plot(TukeyHSD(model, conf.level=.95), las = 2)
Thank you so much!!
I actually get the right results, but can't display them as a heatmap.
Here a way to do it with a tidyverse approach
library(dplyr)
library(lubridate)
library(tidyr)
data <- data.frame(group = rep(c("A", "B", "C"), each = 30),
values = c(runif(30, 0, 3),
runif(30, 0, 5),
runif(30, 1, 7)))
#einfaktorielles ANOVA-Modell anpassen
model <- aov(values~group, data=data)
#Tukey Test durchführen
test_list <- TukeyHSD(model, conf.level=.95)
test_data <- test_list$group
test_data %>%
as_tibble() %>%
bind_cols(data.frame(rw = rownames(test_data))) %>%
separate(rw,into = c("var1","var2")) %>%
ggplot(aes(x = var1,y = var2, fill = `p adj`))+
geom_tile()+
geom_text(aes(label = p.adjust(`p adj`)), color = "white")

How to trace count of events in two groups over time?

I have those data:
require(tidyverse)
(df <- tribble( ~id, ~group, ~event,
1, "A", "2010/01/23",
2, "B", "2010/02/13",
3, "A", "2011/03/21",
4, "B", "2010/01/20",
5, "B", "2012/11/03",
6, "A", "2012/08/12"))
(df$event <- as.POSIXct(df$event, format = "%Y/%m/%d")
And I want to trace a graphic with x = time / y = count of events / color = group.
This is how I did it:
(
df
%>% group_by(group)
%>% arrange(event)
%>% mutate(N = row_number())
%>% ggplot(aes(x=event, y=N, color=group))
+ geom_line()
)

How can I reclassify a categorical raster in terra using category names

I'd like to be able to merge two categories in a categorical raster. The only solution I've figured out so far uses the level index number, not the name of the category. How could I do this using the name of the category?
library(terra)
m <- matrix(rep(c("a", "b", "c"), each = 3), nrow = 3, ncol = 3)
x <- rast(m)
x[x$lyr.1 == "c"]
m2 <- matrix(c("a", "a", "b", "b", "c", "b"), nrow = 3, ncol = 2, byrow = TRUE)
test <- classify(x, m2)
#doesn't work with category names
test <- subst(x, "c", "b")
#doesn't work with category names
test <- subst(x, 2, 1)
#works with category index
Example data
library(terra)
m <- matrix(rep(c("a", "b", "c"), each = 3), nrow = 3, ncol = 3)
x <- rast(m)
m2 <- matrix(c("a", "a", "b", "b", "c", "b"), nrow = 3, ncol = 2, byrow = TRUE)
With the current version of terra you can do either
test1 <- subst(x, "c", "b")
or
test2 <- subst(x, 2, 1, raw=TRUE)
library(terra)
library(tidyverse)
m <- matrix(rep(c("a", "b", "c"), each = 3), nrow = 3, ncol = 3)
x <- rast(m)
plot(x)
reclassified <- cats(x)[[1]] %>%
mutate(label_reclass = forcats::fct_collapse(cats(x)[[1]]$label,c="b"))
x <- categories(x, layer=1, value=reclassified, active=2)
plot(x)
levels(x)
[[1]]
value label_reclass
1 1 a
2 2 c
3 3 c

Find predictions for linear model that is grouped_by

I would like to get predicted values based on a model I fit to a training set of data. I have done this before, but now I have a grouping factor and it is throwing me off. I want to predict biomass based on population for each environment.
library(tidyverse)
fit_mods<-df %>%
group_by(environ) %>%
do(model = lm(biomass ~ poly(population, 2), data = .))
Ultimately, I will want to find at which population biomass is the greatest. Usually I would do this by creating a grid and running the model on my new values and finding the max value, but I'm blanking on how to do this with the grouping. Usual way:
min_pop <- min(df$population)
max_pop <- max(df$population)
grid_pop <- expand.grid(new = (seq(from = min_pop,
to = max_pop,
length.out = 1000)),
environ = c("A", "B"))
#This is what I did with ungrouped data, but doesn't work now.
pred_pop <- predict(object = fit_mods,
newdata = grid_pop,
interval = "predict")
Here is some dummy data:
df <- as.data.frame(list(environ = c("a", "a", "a", "a", "a", "b", "b", "b", "b", "b"),
population = c(2, 3, 4, 5, 6, 3, 4, 5, 6, 7),
biomass = c(1, 2.2, 3.5, 4.1, 3.8, 2.5, 3.6, 4.3, 5.2, 5.1)), class = "data.frame")
In a tidyverse many models approach you could do it the following way:
library(tidyverse)
fit_mods <- df %>%
nest(-environ) %>%
mutate(models = map(data, ~ lm(biomass ~ poly(population, 2), data = .x)),
min_pop = map_dbl(data, ~ pull(.x, population) %>% min),
max_pop = map_dbl(data, ~ pull(.x, population) %>% max),
new = map2(min_pop, max_pop, ~ tibble(population = seq(from = .x,
to = .y,
length.out = 1000))),
pred = map2(models,
new,
~ predict(object = .x,
newdata = select(.y,population),
interval = "predict")))

Resources