Apply function on dataframe by specific group in R - r

I have a dataframe that looks something like this:
dist id daytime season
3 1.11 Name1 day summer
4 2.22 Name2 night spring
5 3.33 Name1 day winter
6 4.44 Name3 night fall
I want of summary of distby some specific columns in my dataframe.
So far I used a custom function:
summary <- function(x){df %>%
group_by(x) %>%
summarize(min = min(dist),
q1 = quantile(dist, 0.25),
median = median(dist),
mean = mean(dist),
q3 = quantile(dist, 0.75),
max = max(dist))}
And applied it to any specific column I wanted at the moment:
summary_ID <- path.summary(id)
I tried it a few weeks ago and would get something like this>
id min q1 median mean q3 max
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Name1 0 17.8 310. 788. 1023. 5832.
2 Name2 0 31.7 284. 570. 744. 9578.
3 Name3 0 17.0 325. 721. 1185. 5293.
4 Name4 0 11.9 197. 530. 865. 3476.
5 Name5 0 24.5 94.9 617. 966. 9567.
When I try it now I get an error:
Error in `group_by()`:
! Must group by variables found in `.data`.
✖ Column `x` is not found.
What changed and how do I get around the issue?

Here, we may use {{}} if the input is unquoted
path_summary <- function(dat, x){
dat %>%
group_by({{x}}) %>%
summarize(min = min(dist),
q1 = quantile(dist, 0.25),
median = median(dist),
mean = mean(dist),
q3 = quantile(dist, 0.75),
max = max(dist))
}
-testing
> path_summary(df, id)
# A tibble: 3 × 7
id min q1 median mean q3 max
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Name1 1.11 1.66 2.22 2.22 2.78 3.33
2 Name2 2.22 2.22 2.22 2.22 2.22 2.22
3 Name3 4.44 4.44 4.44 4.44 4.44 4.44
data
df <- structure(list(dist = c(1.11, 2.22, 3.33, 4.44), id = c("Name1",
"Name2", "Name1", "Name3"), daytime = c("day", "night", "day",
"night"), season = c("summer", "spring", "winter", "fall")),
class = "data.frame", row.names = c("3",
"4", "5", "6"))

Related

R pivot longer with both annual and monthly data

I'm unsure how to structure my pivot longer command when I have both annual and monthly data. For example I have:
wide <- data.frame(region_name = character(), # Create empty data frame
total_population_2019 = numeric(),
total_population_2020 = numeric(),
mean_temperature_2019_1 = numeric(),
mean_temperature_2019_2 = numeric(),
mean_temperature_2020_1 = numeric(),
mean_temperature_2020_2 = numeric(),
stringsAsFactors = FALSE)
wide[1, ] <- list("funville", 50000, 51250, 26.3, 24.6, 25.7, 24.9)
region_name total_population_2019 total_population_2020 mean_temperature_2019_1 mean_temperature_2019_2 mean_temperature_2020_1 mean_temperature_2020_2
funville 50000 51250 26.3 24.6 25.7 24.9
I'm able to pivot on the monthly columns using spread:
long <- pivot_longer(wide, cols = 4:7, names_to = c("layer" ,"year", "month"),
names_pattern = "(.*)_(.*)_?_(.*)") %>%
group_by(layer) %>%
mutate(n = 1:n()) %>%
spread(layer, value) %>%
select(-n)
which gives
region_name total_population_2019 total_population_2020 year month mean_temperature
1 funville 50000 51250 2019 1 26.3
2 funville 50000 51250 2019 2 24.6
3 funville 50000 51250 2020 1 25.7
4 funville 50000 51250 2020 2 24.9
I'd like to now have a population column where the values are attributed for each row/month that falls in that year, ideally would look like:
desired.df <- data.frame(region_name = c("funville", "funville", "funville", "funville"),
year = c("2019", "2019", "2020", "2020"),
month = c("1", "2", "1", "2"),
population = c("50000", "50000", "51250", "51250"),
mean_temperature = c("26.3", "24.6", "25.7", "24.9"))
which gives
region_name year month population mean_temperature
1 funville 2019 1 50000 26.3
2 funville 2019 2 50000 24.6
3 funville 2020 1 51250 25.7
4 funville 2020 2 51250 24.9
Does anyone have a solution? Thanks in advance
One option would be to use the names_pattern argument and the special .value. To make this work I first add a helper month to your population columns. Additionally I use tidyr::fill to fill up the population column:
library(dplyr)
library(tidyr)
wide |>
rename_with(~ paste(.x, 1, sep = "_"), starts_with("total")) |>
pivot_longer(-region_name,
names_to = c(".value", "year", "month"),
names_pattern = "^(.*?)_(\\d+)_(\\d+)$") |>
group_by(year) |>
fill(total_population) |>
arrange(year)
#> # A tibble: 4 × 5
#> # Groups: year [2]
#> region_name year month total_population mean_temperature
#> <chr> <chr> <chr> <dbl> <dbl>
#> 1 funville 2019 1 50000 26.3
#> 2 funville 2019 2 50000 24.6
#> 3 funville 2020 1 51250 25.7
#> 4 funville 2020 2 51250 24.9

Make connections between two datasets

I would like to make a connection between the x and df2 datasets. Notice that the dataset x, I have a percentage value, which in this case for the day 03-01-2021 is 0.1 and for the days 01-02-2021 and 01-01-2022 it is 0.45. So from that information, I know the percentage value for 03-01-2021 is 0.1, so this value falls into category I of my dataset df2 (since the values range from 0.1 to 0.2). As for the days 02-01-2021 and 01-01-2022, they correspond to category F of the df2,since the values range from 0.4 to 0.5. So, I would like to generate an output table as follows:
library(dplyr)
df1<- structure(
list(date2= c("01-01-2022","01-01-2022","03-01-2021","03-01-2021","01-02-2021","01-02-2021"),
Category= c("ABC","CDE","ABC","CDE","ABC","CDE"),
coef= c(5,4,0,2,4,5)),
class = "data.frame", row.names = c(NA, -6L))
x<-df1 %>%
group_by(date2) %>%
summarize(across("coef", sum),.groups = 'drop')%>%
arrange(date2 = as.Date(date2, format = "%d-%m-%Y"))
number<-20
x$Percentage<-x$coef/number
date2 coef Percentage
<chr> <dbl> <dbl>
1 03-01-2021 2 0.1
2 01-02-2021 9 0.45
3 01-01-2022 9 0.45
df2 <- structure(
list(
Category = c("A", "B", "C", "D",
"E", "F", "G", "H", "I", "J"),
From = c(0.9,
0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0),
Until = c(
1,
0.8999,
0.7999,
0.6999,
0.5999,
0.4999,
0.3999,
0.2999,
0.1999,
0.0999
),
`1 Val` = c(
2222,
2017.8,
1793.6,
1621.5,
1522.4,
1457.3,
1325.2,
1229.15,
1223.1,
1177.05
),
`2 Val` = c(3200, 2220, 2560,
2200, 2220, 2080, 1220, 1240, 1720, 1620),
`3 Val` = c(
4665,
4122.5,
3732,
3498.75,
3265.5,
3032.25,
2799,
2682.375,
2565.75,
2449.125
),
`4 Val` = c(
6112,
5222.8,
4889.6,
4224,
4278.4,
3972.8,
3667.2,
3224.4,
3361.6,
3222.8
)
),
row.names = c(NA,-10L),
class = c("tbl_df",
"tbl", "data.frame")
)
Category From Until 1 Val 2 Val 3 Val 4 Val
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 0.9 1 2222 3200 4665 6112
2 B 0.8 0.900 2018 2220 4122 5223
3 C 0.7 0.800 1794 2560 3732 4890
4 D 0.6 0.700 1622 2200 3499 4224
5 E 0.5 0.600 1522 2220 3266 4278
6 F 0.4 0.500 1457 2080 3032 3973
7 G 0.3 0.400 1325 1220 2799 3667
8 H 0.2 0.300 1229 1240 2682 3224
9 I 0.1 0.200 1223 1720 2566 3362
10 J 0 0.0999 1177 1620 2449 3223
Using tidyverse, we do a rowwise on the 'x' dataset, slice the rows of 'df2' where the 'Percentage' falls between the 'From' and 'Until', and unpack the data.frame/tibble column
library(dplyr)
library(tidyr)
x %>%
rowwise %>%
mutate(out = df2 %>%
slice(which(Percentage>= From &
Percentage <= Until)[1]) %>%
select(-(1:3)) ) %>%
ungroup %>%
unpack(out)
-output
# A tibble: 3 × 7
date2 coef Percentage `1 Val` `2 Val` `3 Val` `4 Val`
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 03-01-2021 2 0.1 1223. 1720 2566. 3362.
2 01-02-2021 9 0.45 1457. 2080 3032. 3973.
3 01-01-2022 9 0.45 1457. 2080 3032. 3973.
Or this could be done with a non-equi join
library(data.table)
nm1 <- names(df2)[endsWith(names(df2), 'Val')]
setDT(x)[setDT(df2), (nm1) := mget(nm1),
on = .(Percentage >= From, Percentage <= Until)]
-output
> x
date2 coef Percentage 1 Val 2 Val 3 Val 4 Val
1: 03-01-2021 2 0.10 1223.1 1720 2565.75 3361.6
2: 01-02-2021 9 0.45 1457.3 2080 3032.25 3972.8
3: 01-01-2022 9 0.45 1457.3 2080 3032.25 3972.8

Whitespaces appear when combining paste0 and format in R

To display the results of a regression I ran, I've got a tibble with estimates and corresponding confidence intervals:
library(tidyverse)
library(magrittr
mydata <- structure(list(term = structure(c(1L, 3L, 4L), .Label = c("Intercept",
"Follow-up time (years)", "Age (years)", "Sex (male)", "Never smoker (reference)",
"Current smoker", "Former smoker", "Obesity (=30 kg/m²)", "BMI (kg/m²)",
"Diabetes", "Glucose (mmol/L)", "Glucose lowering medication use",
"Hypertension", "Systolic blood pressure (mmHg)", "Diastolic blood pressure (mmHg)",
"Antihypertensive medication use", "Hypercholesterolemia", "LDL cholesterol (mmol/L)",
"Lipid lowering medication use", "Chronic kidney disease (mL/min/1.73m²)",
"=90 (reference)", "60-89", "=60"), class = c("ordered", "factor"
)), estimate = c(518.38, 0.98, 1.07), conf_low = c(178.74, 0.93,
0.96), conf_high = c(1503.36, 1.03, 1.19), label = c("518.38 (178.74-1503.36)",
" 0.98 ( 0.93- 1.03)", " 1.07 ( 0.96- 1.19)")), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame"))
mydata
# A tibble: 3 x 4
term estimate conf_low conf_high
<ord> <dbl> <dbl> <dbl>
1 Intercept 518. 179. 1503.
2 Age (years) 0.98 0.93 1.03
3 Sex (male) 1.07 0.96 1.19
To make a label that includes the estimate and 95%CI, I've used paste0, and to make sure that every number has two decimals I've used format. However, when combining these, extra whitespaces appear:
mydata <-
mydata %>%
mutate(
label=
paste0(format(round(estimate, digits=2), nsmall=2),
" (",
format(round(conf_low, digits=2), nsmall=2),
"-",
format(round(conf_high, digits=2), nsmall=2),
")",
sep="", collaps=""))
mydata
# A tibble: 3 x 5
term estimate conf_low conf_high label
<ord> <dbl> <dbl> <dbl> <chr>
1 Intercept 518. 179. 1503. "518.38 (178.74-1503.36)"
2 Age (years) 0.98 0.93 1.03 " 0.98 ( 0.93- 1.03)"
3 Sex (male) 1.07 0.96 1.19 " 1.07 ( 0.96- 1.19)"
Why does this happen? Can I prevent this or otherwise remove the whitespaces so that the format becomes "estimate (conf_low-conf_high)"?
Add trim=TRUE in the format() call:
mydata %>%
mutate(
label=
paste0(format(round(estimate, digits=2), nsmall=2, trim=TRUE),
" (",
format(round(conf_low, digits=2), nsmall=2, trim=TRUE),
"-",
format(round(conf_high, digits=2), nsmall=2, trim=TRUE),
")",
sep="", collaps=""))
# A tibble: 3 × 5
term estimate conf_low conf_high label
<ord> <dbl> <dbl> <dbl> <chr>
1 Intercept 518. 179. 1503. "518.38 (178.74-1503.36)"
2 Age (years) 0.98 0.93 1.03 "0.98 (0.93-1.03)"
3 Sex (male) 1.07 0.96 1.19 "1.07 (0.96-1.19)"
1) Use sprintf
mydata %>%
mutate(label = sprintf("%.2f (%.2f-%.2f)", estimate, conf_low, conf_high))
giving:
# A tibble: 3 x 5
term estimate conf_low conf_high label
<ord> <dbl> <dbl> <dbl> <chr>
1 Intercept 518. 179. 1503. 518.38 (178.74-1503.36)
2 Age (years) 0.98 0.93 1.03 0.98 (0.93-1.03)
3 Sex (male) 1.07 0.96 1.19 1.07 (0.96-1.19)
2) or this variation producing slightly different output
mydata %>%
mutate(label = sprintf("%6.2f (%6.2f-%7.2f)", estimate, conf_low, conf_high))
giving;
# A tibble: 3 x 5
term estimate conf_low conf_high label
<ord> <dbl> <dbl> <dbl> <chr>
1 Intercept 518. 179. 1503. "518.38 (178.74-1503.36)"
2 Age (years) 0.98 0.93 1.03 " 0.98 ( 0.93- 1.03)"
3 Sex (male) 1.07 0.96 1.19 " 1.07 ( 0.96- 1.19)"

Using dplyr summarise() for specific columns within purrr map() with grouped data

I have a problem I'm trying to solve, and I can't seem to find a succinct solution. There are a few similar questions on SO, but nothing that quite fits.
Take some sample data:
library(dplyr)
dat <- tibble(
group1 = factor(sample(c("one", "two"), 10, replace = T)),
group2 = factor(sample(c("alpha", "beta"), 10, replace = T)),
var1 = rnorm(10, 20, 2),
var2 = rnorm(10, 20, 2),
var3 = rnorm(10, 20, 2),
other1 = sample(c("a", "b", "c"), 10, replace = T),
other2 = sample(c("a", "b", "c"), 10, replace = T),
)
I would like to summarise just the numeric variables (i.e. ignoring other1 and other2), but have the output grouped by group1 and group2.
I have tried something like this, but it returns an error as it attempts to apply my summarise() functions to the grouping variables too.
dat %>%
group_by(group1, group2) %>%
select(where(is.numeric)) %>%
map(~ .x %>%
filter(!is.na(.x)) %>%
summarise(mean = mean(.x),
sd = sd(.x),
median = median(.x),
q1 = quantile(.x, p = .25),
q3 = quantile(.x, p = .75))
)
My expected output would be something like
group1 group2 mean sd median q1 q3
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
1 one alpha ? ? ? ? ?
2 one beta ? ? ? ? ?
3 two alpha ? ? ? ? ?
4 two beta ? ? ? ? ?
Any solutions would be greatly appreciated.
Thanks,
Sam
Try:
dat %>% group_by(group1,group2) %>%
summarize(across(is.numeric,c(sd = sd,
mean = mean,
median =median,
q1 = function(x) quantile(x,.25),
q3 = function(x) quantile(x,.75))))
group1 group2 var1_sd var1_mean var1_median var1_q1 var1_q3 var2_sd var2_mean var2_median var2_q1 var2_q3 var3_sd
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 one alpha 4.06 20.6 19.3 18.3 22.2 1.12 17.9 17.3 17.2 18.2 1.09
2 one beta 0.726 18.7 18.7 18.4 18.9 0.348 18.8 18.8 18.7 18.9 0.604
3 two alpha 1.31 19.9 20.0 19.3 20.6 1.10 17.8 18.3 17.4 18.5 0.624
4 two beta 0.777 21.2 21.2 21.0 21.5 1.13 19.6 19.6 19.2 20.0 0.0161
You can also pass the columns to the functions in summarise:
dat %>%
group_by(group1, group2) %>%
summarise(mean = mean(var1:var3),
sd = sd(var1:var3),
median = median(var1:var3),
q1 = quantile(var1:var3, p = .25),
q3 = quantile(var1:var3, p = .75))
dat
# A tibble: 4 x 7
# Groups: group1 [2]
# group1 group2 mean sd median q1 q3
# <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 one alpha 19.1 0.707 19.1 18.8 19.3
# 2 one beta 17.5 1.29 17.5 16.8 18.3
# 3 two alpha 17.1 NA 17.1 17.1 17.1
# 4 two beta 19.9 NA 19.9 19.9 19.9

Divide by last row in mutate in Tidyverse

So this is a relatively simple problem, I have a dataset as below
df <- structure(list(term = c("(Intercept)", "overall_quality", "overall_costs",
"wwpf"), estimate = c(0.388607224137536, 0.456477162621961, 0.485612564501229,
NA), std.error = c(0.499812263278414, 0.0987819420575201, 0.108042289289401,
NA), statistic = c(0.777506381273137, 4.62105879995918, 4.49465267438447,
NA), p.value = c(0.440597919486169, 0.0000279867005591494, 0.0000426773877613654,
NA), average = c(NA, 8.09615384615385, 7.86538461538461, 7.90384615384615
), Elasticity = c(NA, 3.69570933584318, 3.81952959386543, NA)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -4L))
I am trying to use below
df %>% mutate(Elasticity= average*estimate/average[nrow(df)])
Expected output: https://ibb.co/42ptLXx
basically, divide by last row value & since I am trying to incorporate this in function, I need the method to be dynamic & not hard coded value.
Please help !
We can use n() to return the index of last row for subsetting the value of that column
library(dplyr)
df %>%
mutate(Elasticity= average*estimate/average[n()])
If we need a function (using rlang_0.4.0), we can make use {{..}} for evaluation
f1 <- function(dat, col1, col2) {
dat %>%
mutate(Elasticity = {{col1}} * {{col2}}/{{col1}}[n()])
}
f1(df, average, estimate)
# A tibble: 4 x 7
# term estimate std.error statistic p.value average Elasticity
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 (Intercept) 0.389 0.500 0.778 0.441 NA NA
#2 overall_quality 0.456 0.0988 4.62 0.0000280 8.10 0.468
#3 overall_costs 0.486 0.108 4.49 0.0000427 7.87 0.483
#4 wwpf NA NA NA NA 7.90 NA

Resources