I would like to compute differences among several columns, per identifiers (see script below for reproducible example and target data frame).
This question is somewhat similar, but only for pairs of identifiers. I can't think of how to adapt it.
I could also have several data frames, one per identifier, but in that case I also don't know how to compute differences across multiple columns.
The code below creates a sample dataset and includes the code I currently use. It gives me what I want; I'd just like to know if there is a way to avoid spelling out every difference I want to compute (in my dataset, I have more parameters and depths than in this sample data).
Thanks in advance for your help!
library(tidyverse)
# sample data
# Build one site's sample profile: three depths each for temperature and
# oxygen, repeated for every requested date.
#
# t: numeric day offset(s) added to Sys.Date(); each resulting date is
#    repeated 6 times (2 parameters x 3 depths).
# Returns a data.frame with columns parameter, date, depth and value. The six
# simulated values are recycled across the dates, so every date shares the
# same profile.
create.dt <- function(t = 0) {
  # Sorted in decreasing order so values fall from depth 1 to depth 3; base
  # sort() replaces the data.frame() %>% arrange() %>% pull round trip and
  # draws the random numbers in the same order (temperature, then oxygen).
  temperature <- sort(rnorm(3, 16, 2), decreasing = TRUE)
  oxygen <- sort(rnorm(3, 7, 1), decreasing = TRUE)
  data.frame(
    parameter = rep(c("temperature", "oxygen"), each = 3),
    date = rep(c(Sys.Date() + t), each = 6),
    depth = rep(1:3, times = 2),
    value = c(temperature, oxygen)
  )
}
# Multi-site dataset
# Stack one simulated profile per site (A to E), each built with day offsets
# -3 to 0, with the site label as the first column.
dt <- do.call(
  rbind,
  lapply(
    c("A", "B", "C", "D", "E"),
    function(site_id) cbind(site = site_id, create.dt(t = c(-3:0)))
  )
)
# Reshape the data and compute differences
# Spread to one column per parameter/depth, then compute the differences,
# depth to depth, parameter by parameter.
# What I would like is not to have to write each difference pair manually.
mutate(
  pivot_wider(
    dt,
    id_cols = c(site, date),
    names_from = c(parameter, depth),
    values_from = value,
    names_sep = "_"
  ),
  temperature_1_2 = temperature_1 - temperature_2,
  temperature_1_3 = temperature_1 - temperature_3,
  temperature_2_3 = temperature_2 - temperature_3,
  oxygen_1_2 = oxygen_1 - oxygen_2,
  oxygen_1_3 = oxygen_1 - oxygen_3,
  oxygen_2_3 = oxygen_2 - oxygen_3
)
library(tidyverse)
library(rlang)
# Build one site's sample profile: three depths each for temperature and
# oxygen, repeated for every requested date.
#
# t: numeric day offset(s) added to Sys.Date(); each resulting date is
#    repeated 6 times (2 parameters x 3 depths).
# Returns a data.frame with columns parameter, date, depth and value. The six
# simulated values are recycled across the dates, so every date shares the
# same profile.
create.dt <- function(t = 0) {
  # Sorted in decreasing order so values fall from depth 1 to depth 3; base
  # sort() replaces the data.frame() %>% arrange() %>% pull round trip and
  # draws the random numbers in the same order (temperature, then oxygen).
  temperature <- sort(rnorm(3, 16, 2), decreasing = TRUE)
  oxygen <- sort(rnorm(3, 7, 1), decreasing = TRUE)
  data.frame(
    parameter = rep(c("temperature", "oxygen"), each = 3),
    date = rep(c(Sys.Date() + t), each = 6),
    depth = rep(1:3, times = 2),
    value = c(temperature, oxygen)
  )
}
# Multi-site dataset
# One simulated profile per site label, bound row-wise in site order; the
# label becomes the leading `site` column.
dt <- do.call(
  rbind,
  lapply(
    c("A", "B", "C", "D", "E"),
    function(site_id) cbind(site = site_id, create.dt(t = c(-3:0)))
  )
)
# result
# Column names per parameter, matching the "<parameter>_<depth>" names that
# pivot_wider() produces below.
temperature <- str_c("temperature_", 1:3)
oxygen <- str_c("oxygen_", 1:3)

# Every within-parameter pair of depths written as "a - b"; these strings are
# used as the names of the difference columns.
temperature_frml <- combn(temperature, m = 2, FUN = function(x) str_c(x, collapse = " - "))
oxygen_frml <- combn(oxygen, m = 2, FUN = function(x) str_c(x, collapse = " - "))
all_frml <- c(temperature_frml, oxygen_frml)

# The same pairs as a 2-row character matrix (row 1 minus row 2), in the same
# order as all_frml.
all_pairs <- cbind(combn(temperature, m = 2), combn(oxygen, m = 2))

df_wider <- dt %>%
  pivot_wider(
    id_cols = c(site, date),
    names_from = c(parameter, depth),
    values_from = value,
    names_sep = "_"
  )

# Subtract the columns directly by name rather than parsing and evaluating
# the "a - b" strings as code.
diffs <- map2(
  all_pairs[1, ],
  all_pairs[2, ],
  ~ df_wider[[.x]] - df_wider[[.y]]
)
bind_cols(df_wider, as_tibble(setNames(diffs, all_frml)))
#> # A tibble: 20 x 14
#> site date temperature_1 temperature_2 temperature_3 oxygen_1 oxygen_2
#> <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 2021-12-11 17.6 17.1 12.9 7.34 6.86
#> 2 A 2021-12-12 17.6 17.1 12.9 7.34 6.86
#> 3 A 2021-12-13 17.6 17.1 12.9 7.34 6.86
#> 4 A 2021-12-14 17.6 17.1 12.9 7.34 6.86
#> 5 B 2021-12-11 17.1 15.6 13.7 8.52 7.58
#> 6 B 2021-12-12 17.1 15.6 13.7 8.52 7.58
#> 7 B 2021-12-13 17.1 15.6 13.7 8.52 7.58
#> 8 B 2021-12-14 17.1 15.6 13.7 8.52 7.58
#> 9 C 2021-12-11 17.7 15.5 13.6 7.66 7.31
#> 10 C 2021-12-12 17.7 15.5 13.6 7.66 7.31
#> 11 C 2021-12-13 17.7 15.5 13.6 7.66 7.31
#> 12 C 2021-12-14 17.7 15.5 13.6 7.66 7.31
#> 13 D 2021-12-11 16.5 16.4 14.5 7.50 7.27
#> 14 D 2021-12-12 16.5 16.4 14.5 7.50 7.27
#> 15 D 2021-12-13 16.5 16.4 14.5 7.50 7.27
#> 16 D 2021-12-14 16.5 16.4 14.5 7.50 7.27
#> 17 E 2021-12-11 16.7 16.1 15.7 7.52 7.51
#> 18 E 2021-12-12 16.7 16.1 15.7 7.52 7.51
#> 19 E 2021-12-13 16.7 16.1 15.7 7.52 7.51
#> 20 E 2021-12-14 16.7 16.1 15.7 7.52 7.51
#> # ... with 7 more variables: oxygen_3 <dbl>,
#> # temperature_1 - temperature_2 <dbl>, temperature_1 - temperature_3 <dbl>,
#> # temperature_2 - temperature_3 <dbl>, oxygen_1 - oxygen_2 <dbl>,
#> # oxygen_1 - oxygen_3 <dbl>, oxygen_2 - oxygen_3 <dbl>
Created on 2021-12-14 by the reprex package (v2.0.1)
Related
I'm having trouble with referring to a dynamic name inside a for loop. I have the following dataframe:
library("tidyverse")
set.seed(10)
# Simulated data: three groups of 100 rows with two uniform measures and a
# binary risk flag drawn with probability 0.05.
df <- data.frame(
  group = rep(LETTERS[1:3], each = 100),
  measure1 = runif(300, min = 20, max = 30),
  measure2 = runif(300, min = 10, max = 20),
  risk = rbinom(n = 300, size = 1, prob = 0.05)
)
# Blank out three values in each of the measure1, measure2 and risk columns.
df$measure1[c(20, 21, 103)] <- NA
df$measure2[c(44, 80, 201)] <- NA
df$risk[c(61, 98, 207)] <- NA
in which I calculate limits:
# Per-group limits computed from the rows with risk == 0: the 5% quantile of
# measure1 (LLm1) and the 95% quantile of measure2 (ULm2), ignoring NAs.
df <- df %>%
  group_by(group) %>%
  mutate(
    LLm1 = quantile(measure1[risk == 0], prob = c(0.05), na.rm = TRUE),
    ULm2 = quantile(measure2[risk == 0], prob = c(0.95), na.rm = TRUE)
  ) %>%
  ungroup()
Now I would like to calculate which rows are outside a certain interval, and I would like to vary this interval according to some predefined additions or subtractions:
# For every pair of offsets (k subtracted from LLm1, i subtracted from ULm2),
# add one flag column named "new<i>var<k>": 1 when both conditions hold, NA
# when either measure is missing, 0 otherwise.
for (k in seq(-2, 2, length.out = 5)) {
  for (i in seq(0.7, 1.0, length.out = 4)) {
    df <- df %>%
      group_by(group) %>%
      mutate(
        !!paste0("new", format(i, nsmall=1), "var", format(k, nsmall=0)) :=
          ifelse(
            measure1 < (LLm1 - k) & measure2 >= (ULm2 - i),
            1,
            ifelse(is.na(measure1) | is.na(measure2), NA, 0)
          )
      ) %>%
      ungroup()
  }
}
So far this works fine. However, I'd like to make another variable based on the dynamic variables I'm creating, like this:
# Same nested loop over the offsets k and i, now adding two columns per pair:
# "var<i>_<k>" (the 0/1/NA flag) and "new<i>_<k>", which is meant to be 1 only
# where that flag is 1 and risk == 1.
for (k in seq(-2, 2, length.out = 5)){
for (i in seq(0.7, 1.0, length.out = 4)){
df %>%
group_by(group) %>%
mutate(
!!paste0("var", format(i, nsmall=1), "_", format(k, nsmall=0)) := ifelse(measure1 < (LLm1 - k) & measure2 >= (ULm2 - i), 1, ifelse(is.na(measure1) | is.na(measure2), NA, 0)),
# NOTE(review): inside ifelse() below, `!!paste0(...)` injects the column name
# as a character string, so the test compares that string with 1 instead of
# comparing the column's values with 1.
!!paste0("new", format(i, nsmall=1), "_", format(k, nsmall=0)) := ifelse((!!paste0("var", format(i, nsmall=1), "_", format(k, nsmall=0))) == 1 & risk == 1, 1, ifelse(is.na(measure1) | is.na(measure2), NA, 0)),
) %>% ungroup -> df
}
}
Unfortunately I'm not getting the desired results: a row that has a 1 in the new variable 'var' and a 1 in 'risk' does not get a 1 but a 0. I've tried some alternatives with brackets and eval(), but the result stays the same. Can anyone show me where I'm wrong in the syntax, or help explain how to refer to a dynamic name inside the for loop?
You have to wrap the string with the variable name after the double-bang operator !! in sym() to make sure it is treated as a name.
Further, as I pointed out in my comment, the condition risk == 1 in ifelse is never met, so it seems like it's not working; for the example at hand, I dropped that condition.
# For every pair of offsets, add "var<i>_<k>" (1 / NA / 0 flag) and
# "new<i>_<k>", which re-reads the freshly created flag column. The column
# name is wrapped in sym() before `!!` so it is treated as a name, not as a
# character string.
for (k in seq(-2, 2, length.out = 5)) {
  for (i in seq(0.7, 1.0, length.out = 4)) {
    df <- df %>%
      group_by(group) %>%
      mutate(
        !!paste0("var", format(i, nsmall=1), "_", format(k, nsmall=0)) :=
          ifelse(
            measure1 < (LLm1 - k) & measure2 >= (ULm2 - i),
            1,
            ifelse(is.na(measure1) | is.na(measure2), NA, 0)
          ),
        !!paste0("new", format(i, nsmall=1), "_", format(k, nsmall=0)) :=
          ifelse(
            (!! sym(paste0("var", format(i, nsmall=1), "_", format(k, nsmall=0)))) == 1,
            1,
            ifelse(is.na(measure1) | is.na(measure2), NA, 0)
          )
      ) %>%
      ungroup()
  }
}
# Keep only rows where at least one "new" column is non-zero.
filter(df, if_any(starts_with("new"), ~ .x != 0))
#> # A tibble: 23 x 46
#> group measure1 measure2 risk LLm1 ULm2 `var0.7_-2` `new0.7_-2` `var0.8_-2`
#> <chr> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 22.6 19.7 0 20.6 19.5 1 1 1
#> 2 A 21.0 19.3 0 20.6 19.5 1 1 1
#> 3 A 21.8 19.8 0 20.6 19.5 1 1 1
#> 4 A 21.9 19.0 0 20.6 19.5 1 1 1
#> 5 A 22.2 18.9 1 20.6 19.5 1 1 1
#> 6 A 21.7 19.5 0 20.6 19.5 1 1 1
#> 7 B 22.1 19.4 0 20.6 19.4 1 1 1
#> 8 B 22.1 18.6 0 20.6 19.4 0 0 0
#> 9 B 20.6 19.6 0 20.6 19.4 1 1 1
#> 10 B 22.5 18.7 0 20.6 19.4 0 0 1
#> # ... with 13 more rows, and 37 more variables: new0.8_-2 <dbl>,
#> # var0.9_-2 <dbl>, new0.9_-2 <dbl>, var1.0_-2 <dbl>, new1.0_-2 <dbl>,
#> # var0.7_-1 <dbl>, new0.7_-1 <dbl>, var0.8_-1 <dbl>, new0.8_-1 <dbl>,
#> # var0.9_-1 <dbl>, new0.9_-1 <dbl>, var1.0_-1 <dbl>, new1.0_-1 <dbl>,
#> # var0.7_0 <dbl>, new0.7_0 <dbl>, var0.8_0 <dbl>, new0.8_0 <dbl>,
#> # var0.9_0 <dbl>, new0.9_0 <dbl>, var1.0_0 <dbl>, new1.0_0 <dbl>,
#> # var0.7_1 <dbl>, new0.7_1 <dbl>, var0.8_1 <dbl>, new0.8_1 <dbl>, ...
Another way to approach the problem is to use the dplyover package (disclaimer: I'm the maintainer), specifically the function dplyover::over2x(), which generates columns in a nested-loop style based on the input objects.
After dplyover::over2x() we can just use a regular call to across() and target all variables selected by starts_with("var").
library(dplyover)
# Build the "var..." flag columns with dplyover::over2x() and derive the
# matching "new..." columns with across(), all inside one grouped mutate().
df %>%
group_by(group) %>%
mutate(
# NOTE(review): presumably .x iterates over the first sequence (offset on
# LLm1) and .y over the second (offset on ULm2) - confirm against the
# dplyover documentation.
over2x(seq(-2, 2, length.out = 5),
seq(0.7, 1.0, length.out = 4),
~ ifelse(measure1 < (LLm1 - .x) & measure2 >= (ULm2 - .y), 1, ifelse(is.na(measure1) | is.na(measure2), NA, 0)),
.names = "var{y}_{x}"
),
# For each column starting with "var": 1 where it equals 1, NA where either
# measure is missing, 0 otherwise.
across(starts_with("var"),
~ ifelse(.x == 1, 1,
ifelse(is.na(measure1) | is.na(measure2),
NA, 0)),
# Output name: the source column name with 'var' replaced by 'new'.
.names = "{gsub('var', 'new', {.col})}")
) %>%
ungroup()
#> # A tibble: 300 x 46
#> group measure1 measure2 risk LLm1 ULm2 `var0.7_-2` `var0.8_-2` `var0.9_-2`
#> <chr> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 27.2 12.6 0 20.8 19.3 0 0 0
#> 2 A 29.0 14.1 0 20.8 19.3 0 0 0
#> 3 A 29.3 14.6 0 20.8 19.3 0 0 0
#> 4 A 25.4 14.4 0 20.8 19.3 0 0 0
#> 5 A 25.2 12.3 0 20.8 19.3 0 0 0
#> 6 A 25.6 10.0 0 20.8 19.3 0 0 0
#> 7 A 28.2 10.3 0 20.8 19.3 0 0 0
#> 8 A 27.4 19.7 0 20.8 19.3 0 0 0
#> 9 A 24.8 19.3 0 20.8 19.3 0 0 0
#> 10 A 22.7 18.0 0 20.8 19.3 0 0 0
#> # ... with 290 more rows, and 37 more variables: var1_-2 <dbl>,
#> # var0.7_-1 <dbl>, var0.8_-1 <dbl>, var0.9_-1 <dbl>, var1_-1 <dbl>,
#> # var0.7_0 <dbl>, var0.8_0 <dbl>, var0.9_0 <dbl>, var1_0 <dbl>,
#> # var0.7_1 <dbl>, var0.8_1 <dbl>, var0.9_1 <dbl>, var1_1 <dbl>,
#> # var0.7_2 <dbl>, var0.8_2 <dbl>, var0.9_2 <dbl>, var1_2 <dbl>,
#> # new0.7_-2 <dbl>, new0.8_-2 <dbl>, new0.9_-2 <dbl>, new1_-2 <dbl>,
#> # new0.7_-1 <dbl>, new0.8_-1 <dbl>, new0.9_-1 <dbl>, new1_-1 <dbl>, ...
The data
# Example data: 300 rows in three equally sized groups (A, B, C), two uniform
# measures and a 0/1 risk indicator.
df <- data.frame(
  group = rep(LETTERS[1:3], each = 100),
  measure1 = runif(300, min = 20, max = 30),
  measure2 = runif(300, min = 10, max = 20),
  risk = rbinom(n = 300, size = 1, prob = 0.05)
)
# Set three entries to NA in each of measure1, measure2 and risk.
df[["measure1"]][c(20, 21, 103)] <- NA
df[["measure2"]][c(44, 80, 201)] <- NA
df[["risk"]][c(61, 98, 207)] <- NA
library(dplyr)
# Add the per-group limits: LLm1 is the 5% quantile of measure1 and ULm2 the
# 95% quantile of measure2, both taken over rows with risk == 0 (NAs removed).
df <- df %>%
  group_by(group) %>%
  mutate(
    LLm1 = quantile(measure1[risk == 0], prob = c(0.05), na.rm = TRUE),
    ULm2 = quantile(measure2[risk == 0], prob = c(0.95), na.rm = TRUE)
  ) %>%
  ungroup()
Created on 2022-11-25 by the reprex package (v2.0.1)
I have the following sample data:
# Sample data: `samplesize` rows with randomly drawn sex, age group and
# country, plus two uniform durations between 1 and 100.
samplesize <- 100
df <- data.frame(
  sex = sample(c("M", "F"), size = samplesize, replace = TRUE),
  agegrp = sample(c("old", "middle", "young"), size = samplesize, replace = TRUE),
  duration1 = runif(samplesize, min = 1, max = 100),
  duration2 = runif(samplesize, min = 1, max = 100),
  country = sample(c("USA", "CAN"), size = samplesize, replace = TRUE)
)
df
My goal is to plot a table like this that displays the median values [median(na.rm = TRUE) as there might be missing values]
USA CAN
total old middle young M F total old middle young M F
duration1 10.2 12.2 13.1 10.2 13.0 13.9 ... ... ... ... ... ...
duration2 10.4 13.2 13.2 10.0 13.1 14.0 ... ... ... ... ... ...
The way I would usually calculate such a table is to calculate the median values columnwise:
# Median of each duration column per country and age group; na.rm = TRUE
# skips missing values.
df %>%
  group_by(country, agegrp) %>%
  summarise(
    dur1 = median(duration1, na.rm = TRUE),
    dur2 = median(duration2, na.rm = TRUE)
  )
And finally I put all the columns together. Unfortunately, as the number of combinations gets bigger, this method becomes very cumbersome. So my question is:
Is there any function like table() that lets me calculate means or medians (instead of frequencies) using specific combinations of variables?
It would also be fine if it was just a two-dimensional table with multi-dimensional variable names like:
USA_total USA_old USA_middle USA_young USA_m USA_f CAN_total ...
duration1
duration2
library(dplyr)
library(tidyr)
# Stack every column except sex, agegrp and country, average each one per
# age group and country, then spread the combinations into
# "<country>_<agegrp>" columns.
df %>%
  pivot_longer(
    cols = -c(sex, agegrp, country),
    names_to = "parameters",
    values_to = "value"
  ) %>%
  group_by(agegrp, country, parameters) %>%
  summarise(mean = mean(value, na.rm = TRUE)) %>%
  pivot_wider(
    names_from = c(country, agegrp),
    values_from = mean
  )
Returns:
parameters CAN_middle USA_middle CAN_old USA_old CAN_young USA_young
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 duration1 48.6 62.6 31.5 40.0 43.0 50.5
2 duration2 60.9 54.0 53.1 58.9 45.1 55.6
Edit
Including M and F:
library(dplyr)
library(tidyr)
# Fold sex and agegrp into a single `groupings` column so both kinds of
# subgroup are summarised in one pass, then average every remaining column
# per grouping and country and spread into "<country>_<grouping>" columns.
df %>%
  pivot_longer(
    cols = c(sex, agegrp),
    names_to = "groupings_names",
    values_to = "groupings"
  ) %>%
  select(-groupings_names) %>%
  pivot_longer(
    cols = -c(groupings, country),
    names_to = "parameters",
    values_to = "value"
  ) %>%
  group_by(groupings, country, parameters) %>%
  summarise(mean = mean(value, na.rm = TRUE)) %>%
  pivot_wider(
    names_from = c(country, groupings),
    values_from = mean
  )
parameters CAN_F USA_F CAN_M USA_M CAN_middle USA_middle CAN_old USA_old CAN_young USA_young
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 duration1 63.3 59.0 50.9 47.7 57.9 46.1 56.8 60.6 59.5 49.1
2 duration2 60.6 59.0 54.9 48.3 65.0 45.6 48.5 49.5 55.8 62.4
I have a data.table:
# Simulated measurements: each vector concatenates four normal samples
# (sd = 1), one per listed mean. The h* vectors hold 200 values and the
# w*/e* vectors 600, so data.frame() recycles h1 and h2 three times.
h1 <- unlist(lapply(c(50, 60, 70, 80), function(mu) rnorm(50, mean = mu, sd = 1)))
w1 <- unlist(lapply(c(150, 160, 170, 180), function(mu) rnorm(150, mean = mu, sd = 1)))
e1 <- unlist(lapply(c(150, 160, 170, 180), function(mu) rnorm(150, mean = mu, sd = 1)))
h2 <- unlist(lapply(c(50, 60, 70, 80), function(mu) rnorm(50, mean = mu, sd = 1)))
w2 <- unlist(lapply(c(150, 160, 170, 180), function(mu) rnorm(150, mean = mu, sd = 1)))
e2 <- unlist(lapply(c(150, 160, 170, 180), function(mu) rnorm(150, mean = mu, sd = 1)))
df <- data.frame(h1, w1, e1, h2, w2, e2)
> df
h1 w1 e1 h2 w2 e2
1 49.85148 148.6694 149.4619 49.05355 151.1857 147.7629
2 49.81708 149.7126 149.1840 50.75627 150.4471 149.2853
How can I find the differences between the columns in the data.table?
what I want to get:
h1
w1
e1
h2
w2
e2
h2-h1
w2-w1
e2-e1
49.85148
148.6694
149.4619
49.05355
151.1857
147.7629
-0,79793
2,5163
-1,699
You can divide the dataframe in half and subtract the second part with the first one and assign new columns names.
# Pair the k-th column of the first half with the k-th column of the second
# half and append one "second-first" difference column per pair.
n <- ncol(df)
col1 <- 1:(n/2)
col2 <- (n/2 + 1):n
first_names <- names(df)[col1]
second_names <- names(df)[col2]
new_col_name <- paste0(second_names, "-", first_names)
df[new_col_name] <- df[col2] - df[col1]
head(df)
# h1 w1 e1 h2 w2 e2 h2-h1 w2-w1 e2-e1
#1 49.43 149.6 150.2 49.39 149.4 150.1 -0.03665 -0.193458 -0.09741
#2 50.10 149.7 150.8 49.03 149.6 149.6 -1.07812 -0.053813 -1.25975
#3 50.05 149.8 150.7 48.42 149.8 151.0 -1.62448 -0.007319 0.32304
#4 49.77 149.7 148.8 49.92 148.7 149.1 0.15132 -1.005730 0.23139
#5 49.44 149.9 151.0 48.39 150.9 150.0 -1.04673 0.977863 -0.97748
#6 49.58 148.8 151.1 50.41 150.6 148.6 0.83088 1.800697 -2.52930
Perhaps this:
library(tidyverse)
# For each name stem (the first character of every column name), subtract the
# second matching column from the first and append the result as
# "<stem>1-<stem>2".
str_sub(names(df), 1, 1) %>%
  unique() %>%
  # `.[[k]]` always yields a plain vector, whether df is a data.frame or a
  # tibble.
  map_dfc(~ select(df, starts_with(.x)) %>%
    transmute('{.x}1-{.x}2' := .[[1]] - .[[2]])) %>%
  {bind_cols(df, .)} %>%
  # as_tibble() replaces as.tibble(), deprecated since tibble 2.0.0.
  as_tibble()
#> # A tibble: 600 x 9
#> h1 w1 e1 h2 w2 e2 `h1-h2` `w1-w2` `e1-e2`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 49.6 152. 151. 48.4 151. 148. 1.22 1.13 2.90
#> 2 50.1 150. 150. 51.6 150. 150. -1.50 -0.301 0.552
#> 3 50.9 150. 150. 50.4 151. 148. 0.474 -1.28 1.64
#> 4 51.6 149. 152. 51.0 150. 149. 0.599 -0.613 2.78
#> 5 50.2 149. 150. 51.4 150. 152. -1.27 -1.24 -2.44
#> 6 50.9 149. 152. 50.2 151. 151. 0.682 -2.18 0.815
#> 7 49.4 149. 149. 49.0 151. 149. 0.315 -2.57 -0.858
#> 8 50.3 151. 152. 50.4 148. 149. -0.0959 2.38 2.70
#> 9 50.3 151. 151. 49.0 150. 150. 1.30 0.611 0.749
#> 10 51.5 149. 150. 50.0 150. 149. 1.47 -1.21 0.366
#> # … with 590 more rows
Created on 2021-06-25 by the reprex package (v2.0.0)
Variation on the idea of splitting names up and subtracting:
# Columns whose names end in "1"; each is paired with the column that has the
# same stem but ends in "2".
sel <- names(df)[endsWith(names(df), "1")]
# The replacement is anchored to the trailing digit so that a "1" occurring
# earlier in a name (e.g. "x11" -> "x12") is left untouched.
df[sprintf("%1$s1-%1$s2", sub("[12]$", "", sel))] <- df[sel] - df[sub("1$", "2", sel)]
head(df)
# h1 w1 e1 h2 w2 e2 h1-h2 w1-w2 e1-e2
#1 49.93223 150.9997 149.2362 50.34892 150.3622 148.6164 -0.4166869 0.6375487 0.6198452
#2 48.83462 149.7938 150.9722 49.24049 150.0979 149.2758 -0.4058752 -0.3040331 1.6964756
#3 49.55578 146.8567 149.4173 48.61832 150.7250 148.2298 0.9374638 -3.8682714 1.1875108
I have a list and a list of lists and would like to create a data.frame or data.table.
Here is the list:
head(stadte_namen)
[1] "Berlin" "Hamburg" "München"
and a list of lists
> head(result)
[[1]]
min max
x 13.22886 13.54886
y 52.35704 52.67704
[[2]]
min max
x 9.840654 10.16065
y 53.390341 53.71034
[[3]]
min max
x 11.36078 11.72291
y 48.06162 48.24812
How could I create a data.frame or a data.table with the following structure?
name xmin ymin xmax ymax
Berlin 13.22886 52.35704 13.54886 52.67704
Hamburg 9.840654 53.390341 10.16065 53.71034
München 11.36078 48.06162 11.72291 48.24812
...
Here is the data:
# City names, in the same order as the bounding boxes in `result`.
stadte_namen <- c(
  "Berlin", "Hamburg", "München", "Köln", "Frankfurt am Main", "Stuttgart"
)
# One 2 x 2 matrix per city: rows "x"/"y", columns "min"/"max". Each numeric
# vector is filled column-wise, i.e. x-min, y-min, x-max, y-max.
result <- lapply(
  list(
    c(13.2288599, 52.3570365, 13.5488599, 52.6770365),
    c(9.840654, 53.390341, 10.160654, 53.710341),
    c(11.360777, 48.0616244, 11.7229083, 48.2481162),
    c(6.7725303, 50.8304399, 7.162028, 51.0849743),
    c(8.4727933, 50.0155435, 8.8004716, 50.2271408),
    c(9.0386007, 48.6920188, 9.3160228, 48.8663994)
  ),
  function(bbox) {
    matrix(
      bbox,
      nrow = 2L,
      ncol = 2L,
      dimnames = list(c("x", "y"), c("min", "max"))
    )
  }
)
You can also try:
l <- result
# c() flattens each 2 x 2 matrix column by column (x-min, y-min, x-max,
# y-max). vapply() requires exactly four numbers per element, so the result
# is always a 4-row matrix, whereas sapply() could silently return a list.
df <- data.frame(t(vapply(l, c, numeric(4))))
colnames(df) <- c("minX", "minY", "maxX", "maxY")
df
df$stadte_namen <- c("Berlin", "Hamburg", "München", "Köln", "Frankfurt am Main",
                     "Stuttgart")
df
Answer:
minX minY maxX maxY stadte_namen
1 13.228860 52.35704 13.548860 52.67704 Berlin
2 9.840654 53.39034 10.160654 53.71034 Hamburg
3 11.360777 48.06162 11.722908 48.24812 München
4 6.772530 50.83044 7.162028 51.08497 Köln
5 8.472793 50.01554 8.800472 50.22714 Frankfurt am Main
6 9.038601 48.69202 9.316023 48.86640 Stuttgart
With lapply and purrr:
library(dplyr)
library(purrr)
# Turn each bounding-box matrix into a named vector (xmin, xmax, ymin, ymax),
# bind the vectors row-wise into one tibble, then attach the city names.
data <- result %>%
  lapply(function(bbox) {
    c(
      xmin = bbox[1, 1],
      xmax = bbox[1, 2],
      ymin = bbox[2, 1],
      ymax = bbox[2, 2]
    )
  }) %>%
  purrr::map_dfr(~.x)
data$stadte_namen <- stadte_namen
# A tibble: 6 x 5
xmin xmax ymin ymax stadte_namen
<dbl> <dbl> <dbl> <dbl> <chr>
1 13.2 13.5 52.4 52.7 Berlin
2 9.84 10.2 53.4 53.7 Hamburg
3 11.4 11.7 48.1 48.2 München
4 6.77 7.16 50.8 51.1 Köln
5 8.47 8.80 50.0 50.2 Frankfurt am Main
6 9.04 9.32 48.7 48.9 Stuttgart
Assign stadte_namen as names to result and bind the dataframe together in one dataframe. You can get the data in wide format using pivot_wider.
library(tidyverse)
# Name each matrix after its city, convert it to a data frame that keeps the
# row label ("x"/"y") in a `row` column, stack everything with the city in
# `name`, then widen to one min/max column per row label.
map_df(
  setNames(result, stadte_namen),
  function(bbox) {
    rownames_to_column(as.data.frame(bbox), 'row')
  },
  .id = 'name'
) %>%
  pivot_wider(names_from = row, values_from = c(min, max))
# name min_x min_y max_x max_y
# <chr> <dbl> <dbl> <dbl> <dbl>
#1 Berlin 13.2 52.4 13.5 52.7
#2 Hamburg 9.84 53.4 10.2 53.7
#3 München 11.4 48.1 11.7 48.2
#4 Köln 6.77 50.8 7.16 51.1
#5 Frankfurt am Main 8.47 50.0 8.80 50.2
#6 Stuttgart 9.04 48.7 9.32 48.9
I have a problem I'm trying to solve, and I can't seem to find a succinct solution. There are a few similar questions on SO, but nothing that quite fits.
Take some sample data:
library(dplyr)
# Sample data: two grouping factors, three numeric variables and two
# character columns that should be ignored when summarising.
# `replace = TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned.
dat <- tibble(
  group1 = factor(sample(c("one", "two"), 10, replace = TRUE)),
  group2 = factor(sample(c("alpha", "beta"), 10, replace = TRUE)),
  var1 = rnorm(10, 20, 2),
  var2 = rnorm(10, 20, 2),
  var3 = rnorm(10, 20, 2),
  other1 = sample(c("a", "b", "c"), 10, replace = TRUE),
  other2 = sample(c("a", "b", "c"), 10, replace = TRUE),
)
I would like to summarise just the numeric variables (i.e. ignoring other1 and other2), but have the output grouped by group1 and group2.
I have tried something like this, but it returns an error as it attempts to apply my summarise() functions to the grouping variables too.
# Attempt to summarise only the numeric columns per group1/group2.
# NOTE(review): according to the surrounding text this returns an error,
# reportedly because the summary functions are also applied to the grouping
# variables.
dat %>%
group_by(group1, group2) %>%
select(where(is.numeric)) %>%
# map() iterates over the columns of the selected data, so .x is a single
# column in the lambda below.
map(~ .x %>%
filter(!is.na(.x)) %>%
summarise(mean = mean(.x),
sd = sd(.x),
median = median(.x),
q1 = quantile(.x, p = .25),
q3 = quantile(.x, p = .75))
)
My expected output would be something like
group1 group2 mean sd median q1 q3
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
1 one alpha ? ? ? ? ?
2 one beta ? ? ? ? ?
3 two alpha ? ? ? ? ?
4 two beta ? ? ? ? ?
Any solutions would be greatly appreciated.
Thanks,
Sam
Try:
# Per group1/group2 combination, compute five summaries of every numeric
# column; output columns are named "<column>_<summary>".
dat %>%
  group_by(group1, group2) %>%
  summarize(across(
    # Predicates must be wrapped in where(); passing is.numeric bare is
    # deprecated in tidyselect.
    where(is.numeric),
    list(
      sd = sd,
      mean = mean,
      median = median,
      q1 = function(x) quantile(x, .25),
      q3 = function(x) quantile(x, .75)
    )
  ))
group1 group2 var1_sd var1_mean var1_median var1_q1 var1_q3 var2_sd var2_mean var2_median var2_q1 var2_q3 var3_sd
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 one alpha 4.06 20.6 19.3 18.3 22.2 1.12 17.9 17.3 17.2 18.2 1.09
2 one beta 0.726 18.7 18.7 18.4 18.9 0.348 18.8 18.8 18.7 18.9 0.604
3 two alpha 1.31 19.9 20.0 19.3 20.6 1.10 17.8 18.3 17.4 18.5 0.624
4 two beta 0.777 21.2 21.2 21.0 21.5 1.13 19.6 19.6 19.2 20.0 0.0161
You can also pass the columns to the functions in summarise:
# Summarise var1, var2 and var3 together, per group1/group2 combination.
# The columns are pooled explicitly with c(): inside mean(), sd(), etc.,
# `var1:var3` is the numeric sequence operator (built from the first element
# of each side only), not a column range, so it does not summarise the data.
# Output printed from the earlier `var1:var3` form no longer applies.
dat %>%
  group_by(group1, group2) %>%
  summarise(
    mean = mean(c(var1, var2, var3)),
    sd = sd(c(var1, var2, var3)),
    median = median(c(var1, var2, var3)),
    q1 = quantile(c(var1, var2, var3), p = .25),
    q3 = quantile(c(var1, var2, var3), p = .75)
  )
dat
# A tibble: 4 x 7
# Groups: group1 [2]
# group1 group2 mean sd median q1 q3
# <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 one alpha 19.1 0.707 19.1 18.8 19.3
# 2 one beta 17.5 1.29 17.5 16.8 18.3
# 3 two alpha 17.1 NA 17.1 17.1 17.1
# 4 two beta 19.9 NA 19.9 19.9 19.9