dplyr get linear regression coefficients - r

I'm wondering if there is a better way to get linear regression coefficients as columns in dplyr. Here is some sample data.
mydata <-
data.frame(
Site = c(1,1,1,1,1,1,1,1),
Site1 = c(2,3,2,3,2,3,2,3),
Age = c(17, 52, 19, 18, 62, 53, 41, 24),
Gender = c(1,2,1,1,2,2,2,1),
Outcome = c(1,1,1,1,0,0,0,1)
)
I wrote this helper function to turn summary(.data)$coefficients into columns
#' Flatten a model's coefficient table into a one-row data frame
#'
#' @param .data A fitted model object whose `summary()` contains a
#'   `coefficients` matrix (for example the result of `lm()`).
#' @return A data frame with one row and one numeric column per cell of the
#'   coefficient matrix. Columns are named `<term>_<statistic>` (for example
#'   `Age_Std. Error`) and ordered statistic by statistic.
GetCoefficients <- function(.data) {
  # Compute the summary once instead of once per lookup.
  coefs <- summary(.data)$coefficients

  # outer() lays the names out in the same column-major order in which
  # as.vector() unrolls the matrix, so every value gets the right label
  # regardless of how many terms or statistics the table has.
  cell_names <- outer(rownames(coefs), colnames(coefs), paste, sep = "_")
  cells <- as.list(as.vector(coefs))
  names(cells) <- as.vector(cell_names)

  # optional = TRUE keeps non-syntactic names such as "(Intercept)_Estimate".
  as.data.frame(cells, optional = TRUE)
}
Using the helper function I can put coefficients into my data.frame()
Linear_regression <- mydata %>%
pivot_longer(starts_with("Site"),
names_to = ".value",
names_pattern = "(^Site)") %>%
group_by(Site) %>%
do(Reg = lm(Outcome ~ Age + Gender, data = .)) %>%
mutate(rsq = summary(Reg)$r.squared) %>%
mutate(fun = GetCoefficients(Reg))

Here is a combination of tidyverse and broom package to get your desired output.
Very handy here is group_split(): it returns a list of data frames, one per group. You then iterate over that list with purrr's map_dfr(), fitting the regression lm(...) on each list element (by the way, map_dfr() row-binds the results into a single data frame, whereas map() would return a list). Using broom's glance() gives the desired output:
library(tidyverse)
library(broom)
mydata %>%
pivot_longer(starts_with("Site"),
names_to = ".value",
names_pattern = "(^Site)") %>%
mutate(Site=as.factor(Site)) %>%
group_by(Site) %>%
group_split() %>%
map_dfr(.f = function(df){
lm(Outcome ~ Age+Gender, data=df) %>%
glance() %>%
add_column(Site = unique(df$Site), .before = 1)
})
Site r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
1 1 0.6 0.44 3.87e- 1 3.75e+ 0 1.01e- 1 2 -1.88 11.8 12.1 7.5 e- 1 5 8
2 2 1 1 2.22e-16 1.01e+31 2.22e-16 2 141. -275. -277. 4.93e-32 1 4
3 3 0.351 -0.946 6.97e- 1 2.71e- 1 8.05e- 1 2 -1.46 10.9 8.47 4.86e- 1 1 4

The function broom::tidy extracts the coefficient estimates as well as their standard errors, t-statistics & p-values into rows.
library("tidyverse")
mydata <- tibble::tribble(
~Age, ~Gender, ~Outcome, ~Site,
17, 1, 1, "1",
17, 1, 1, "2",
52, 2, 1, "1",
52, 2, 1, "3",
19, 1, 1, "1",
19, 1, 1, "2",
18, 1, 1, "1",
18, 1, 1, "3",
62, 2, 0, "1",
62, 2, 0, "2",
53, 2, 0, "1",
53, 2, 0, "3",
41, 2, 0, "1",
41, 2, 0, "2",
24, 1, 1, "1",
24, 1, 1, "3"
)
mydata %>%
group_by(
Site
) %>%
group_modify(
~ broom::tidy(lm(Outcome ~ Age + Gender, data = .))
)
#> Warning in summary.lm(x): essentially perfect fit: summary may be unreliable
#> # A tibble: 9 × 6
#> # Groups: Site [3]
#> Site term estimate std.error statistic p.value
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 1 (Intercept) 1.75e+ 0 5.37e- 1 3.26e+ 0 2.25e- 2
#> 2 1 Age -9.13e-18 2.44e- 2 -3.74e-16 1 e+ 0
#> 3 1 Gender -7.50e- 1 8.40e- 1 -8.92e- 1 4.13e- 1
#> 4 2 (Intercept) 2 e+ 0 4.20e-16 4.76e+15 1.34e-16
#> 5 2 Age 9.08e-18 1.49e-17 6.10e- 1 6.51e- 1
#> 6 2 Gender -1 e+ 0 5.46e-16 -1.83e+15 3.48e-16
#> 7 3 (Intercept) 1.22e+ 0 2.03e+ 0 6.00e- 1 6.56e- 1
#> 8 3 Age -2.70e- 2 1.62e- 1 -1.67e- 1 8.95e- 1
#> 9 3 Gender 3.51e- 1 5.16e+ 0 6.82e- 2 9.57e- 1
Sometimes however, we would like to extract only the coefficient estimates into columns.
mydata %>%
group_by(
Site
) %>%
group_modify(
~ bind_rows(coefficients(lm(Outcome ~ Age + Gender, data = .)))
)
#> # A tibble: 3 × 4
#> # Groups: Site [3]
#> Site `(Intercept)` Age Gender
#> <chr> <dbl> <dbl> <dbl>
#> 1 1 1.75 -9.13e-18 -0.750
#> 2 2 2 9.08e-18 -1
#> 3 3 1.22 -2.70e- 2 0.351
Created on 2022-04-20 by the reprex package (v2.0.1)

Related

tidyr mutate new column based on group by with calculation

Using tidyr, how can I create a new column through a group-by and calculation?
For example, if I have this dataframe:
name <- c("a", "a", "a", "a", "b", "b", "b", "b")
x1 <- c(0, 0, 0, 0, 1, 1, 1, 1)
x2 <- c(15, 15, 15, 15, 15, 15, 15, 15)
y <- c(1, 2, 1, 2, 1, 2, 1, 2)
z <- c(50, 100, 40, 90, 65, 95, 40, 95)
df <- data.frame(name, x1, x2, y, z)
Let's say I want to (1) group by x1 and x2; (2) find the max z value in each group; and (3) create a new column z2 that normalizes z by that maximum.
So in this case, the expected output for z2 is c(0.5, 1, 0.4, 0.9, 0.684, 1, 0.421, 1).
We could simply group by 'x1', 'x2' and create the column with mutate
library(dplyr)
df <- df %>%
group_by(x1, x2) %>%
mutate(z2 = (z/max(z, na.rm = TRUE))) %>%
ungroup
-output
df
# A tibble: 8 × 6
name x1 x2 y z z2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a 0 15 1 50 0.5
2 a 0 15 2 100 1
3 a 0 15 1 40 0.4
4 a 0 15 2 90 0.9
5 b 1 15 1 65 0.684
6 b 1 15 2 95 1
7 b 1 15 1 40 0.421
8 b 1 15 2 95 1

finding difference of a column row-by-row in R

I have a subset of data as below:
structure(list(id = c(100, 101, 102, 103, 104, 105),
`family id` = c(1,1, 2, 2, 3, 3),
disease = c(1, 0, 0, 1, 1, 0),
val = c("3.1", "6.2", "2.45", "7.77", "4.56", "2.1")),
class = c("tbl_df", "tbl","data.frame"), row.names = c(NA, -6L))
I want to find the difference: value of sibling with disease(1) - value of sibling with no disease(0)?
the output should be as below:
Adding a helper id column and using tidyr::pivot_wider you could do:
library(dplyr)
library(tidyr)
df |>
group_by(`family id`) |>
mutate(id1 = row_number(), val = as.numeric(val)) |>
ungroup() |>
pivot_wider(names_from = id1, values_from = -c(id1, `family id`), names_sep = "") |>
mutate(difference = ifelse(disease1 == 1, val1 - val2, val2 - val1))
#> # A tibble: 3 × 8
#> `family id` id1 id2 disease1 disease2 val1 val2 difference
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 100 101 1 0 3.1 6.2 -3.1
#> 2 2 102 103 0 1 2.45 7.77 5.32
#> 3 3 104 105 1 0 4.56 2.1 2.46

"Pivot longer" all columns in single-row data frame into two "values" columns

Title is complicated, but I don't know how to put this problem into words. So I'll demonstrate.
Here's my problem, with the desired output:
library(tibble)
# Input:
tribble(
~n_1, ~n_2, ~n_3, ~pct_1, ~pct_2, ~pct_3,
10, 20, 30, 0.1, 0.2, 0.3
)
#> # A tibble: 1 x 6
#> n_1 n_2 n_3 pct_1 pct_2 pct_3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 10 20 30 0.1 0.2 0.3
# Desired output:
tribble(
~name, ~n, ~pct,
1, 10, 0.1,
2, 20, 0.2,
3, 30, 0.3
)
#> # A tibble: 3 x 3
#> name n pct
#> <dbl> <dbl> <dbl>
#> 1 1 10 0.1
#> 2 2 20 0.2
#> 3 3 30 0.3
I tried tidyr::pivot_longer(), but I can't get it right. Is there any way?
One option could be:
df %>%
pivot_longer(everything(),
names_to = c(".value", "name"),
names_pattern = "(.*)_(.)")
name n pct
<chr> <dbl> <dbl>
1 1 10 0.1
2 2 20 0.2
3 3 30 0.3
Try this approach. As your main variable is concatenated, you can use separate() (with sep = '_') after pivot_longer() and then pivot_wider() to obtain the expected data frame. Here is the code:
library(tidyverse)
#Code
df %>% pivot_longer(cols = everything()) %>%
separate(name,into = c('var','name'),sep = '_') %>%
pivot_wider(names_from = var,values_from=value)
Output:
# A tibble: 3 x 3
name n pct
<chr> <dbl> <dbl>
1 1 10 0.1
2 2 20 0.2
3 3 30 0.3
Some data used (the one you provided):
#Data
df <- structure(list(n_1 = 10, n_2 = 20, n_3 = 30, pct_1 = 0.1, pct_2 = 0.2,
pct_3 = 0.3), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))

filter infinite values and NAs in same call using dplyr::c_across and filter_if

I'm looking to filter dataframe rows with Inf and NA in the same call using filter with c_across and deprecated filter_if:
library(dplyr)
df <- tibble(a = c(1, 2, 3, NA, 1), b = c(5, Inf, 8, 8, 3), c = c(9, 10, Inf, 11, 12), d = c('a', 'b', 'c', 'd', 'e'), e = c(1, 2, 3, 4, -Inf))
# # A tibble: 5 x 5
# a b c d e
# <dbl> <dbl> <dbl> <chr> <dbl>
# 1 1 5 9 a 1
# 2 2 Inf 10 b 2
# 3 3 8 Inf c 3
# 4 NA 8 11 d 4
# 5 1 3 12 e -Inf
I could do this in two calls using either c_across or filter_if:
df %>%
rowwise %>%
filter(!any(is.infinite(c_across(where(is.numeric))))) %>%
filter(!any(is.na(c_across(where(is.numeric)))))
# # A tibble: 1 x 5
# # Rowwise:
# a b c d e
# <dbl> <dbl> <dbl> <chr> <dbl>
# 1 1 5 9 a 1
#OR filter_if:
df %>%
filter_if(~is.numeric(.), all_vars(!is.infinite(.))) %>%
filter_if(~is.numeric(.), all_vars(!is.na(.)))
# # A tibble: 1 x 5
# a b c d e
# <dbl> <dbl> <dbl> <chr> <dbl>
# 1 1 5 9 a 1
How would I do both approaches in one call to filter (and filter_if)? There may be an across approach too?
thanks
Try this. Use the where to identify your numeric columns.
# Keep only rows where every numeric column is neither infinite nor NA.
# if_all() replaces the deprecated use of across() inside filter().
df %>%
  filter(if_all(.cols = where(is.numeric),
                .fns = ~ !is.infinite(.x) & !is.na(.x)))
I would suggest an approach with across() from dplyr:
library(dplyr)
#Data
df <- tibble(a = c(1, 2, 3, NA, 1),
b = c(5, Inf, 8, 8, 3),
c = c(9, 10, Inf, 11, 12),
d = c('a', 'b', 'c', 'd', 'e'),
e = c(1, 2, 3, 4, -Inf))
#Filter: keep rows where every column a:e is neither NA nor infinite
#(if_all() replaces the deprecated use of across() inside filter())
df %>% filter(if_all(c(a:e), ~ !is.na(.) & !is.infinite(.)))
Output:
# A tibble: 1 x 5
a b c d e
<dbl> <dbl> <dbl> <chr> <dbl>
1 1 5 9 a 1

How to loop data in R?

Here is a piece of my data:
data_x <- tribble(
~price, ~bokey, ~id, ~cost, ~revenue,
1, "a", 10, 0.20, 30,
2, "b", 20, 0.30, 60,
3, "c", 20, 0.30, 40,
4, "d", 10, 0.20, 100,
5, "e", 30, 0.10, 40,
6, "f", 10, 0.20, 10,
1, "g", 20, 0.30, 80,
2 , "h", 10, 0.20, 20,
3, "h", 30, 0.10, 20,
3, "i", 20, 0.30, 40,
)
As you can see, there are three different types of IDs: 10, 20, 30. But in the real data, there are almost 100 ids. I want to aggregate the data based on these ids. Because I don't know how to do it in a loop, I basically created some subsets:
data_10 <- data_x %>% filter(id == 10)
data_20 <- data_x %>% filter(id == 20)
data_30 <- data_x %>% filter(id == 30)
Here is the aggregated data:
data_agg <- data_10 %>%
group_by(priceseg = cut(as.numeric(price), c(0, 1, 3, 5, 6))) %>%
summarise(price_n = n_distinct(bokey),
Cost = sum(cost, na.rm = T),
Revenue = sum(revenue, na.rm = T),
clicks = n_distinct(bokey)) %>%
mutate(price_n2 = round(100 * prop.table(price_n), 2),
(zet = Cost/Revenue))
But I want to have one more column that shows the id. Here is the desired data:
data_desired <- tribble(
~id, ~priceseg, ~price_n, ~Cost, ~Revenue, ~clicks, ~price_n2, ~`(zet = Cost/Revenue)`
10, (0,1] 1 0.2 30 1 25 0.00667
10, (1,3] 1 0.2 20 1 25 0.01
10, (3,5] 1 0.2 100 1 25 0.002
10, (5,6] 1 0.2 10 1 25 0.02
20,
20,
.
.
) 30,
How can I get it?
Since you are already using dplyr, just add id as one of the grouping variables (no need to previously separate your data):
data_agg <- data_x %>%
group_by(id, priceseg = cut(as.numeric(price), c(0, 1, 3, 5, 6))) %>%
summarise(price_n = n_distinct(bokey),
Cost = sum(cost, na.rm = T),
Revenue = sum(revenue, na.rm = T),
clicks = n_distinct(bokey)) %>%
mutate(price_n2 = round(100 * prop.table(price_n), 2),
(zet = Cost/Revenue))
# A tibble: 8 x 8
# Groups: id [3]
# id priceseg price_n Cost Revenue clicks price_n2 `(zet = Cost/Revenue)`
# <dbl> <fct> <int> <dbl> <dbl> <int> <dbl> <dbl>
# 1 10 (0,1] 1 0.2 30 1 25 0.00667
# 2 10 (1,3] 1 0.2 20 1 25 0.01
# 3 10 (3,5] 1 0.2 100 1 25 0.002
# 4 10 (5,6] 1 0.2 10 1 25 0.02
# 5 20 (0,1] 1 0.3 80 1 25 0.00375
# 6 20 (1,3] 3 0.900 140 3 75 0.00643
# 7 30 (1,3] 1 0.1 20 1 50 0.005
# 8 30 (3,5] 1 0.1 40 1 50 0.0025
An option is to split and loop over with map while specifying the .id
library(dplyr)
library(purrr)
data_x %>%
split(.$id) %>%
map_dfr(~
.x %>%
group_by(priceseg = cut(as.numeric(price), c(0, 1, 3, 5, 6))) %>%
summarise(price_n = n_distinct(bokey),
Cost = sum(cost, na.rm = T),
Revenue = sum(revenue, na.rm = T),
clicks = n_distinct(bokey)) %>%
mutate(price_n2 = round(100 * prop.table(price_n), 2),
(zet = Cost/Revenue)), .id = "id" )
# A tibble: 8 x 8
# id priceseg price_n Cost Revenue clicks price_n2 `(zet = Cost/Revenue)`
# <chr> <fct> <int> <dbl> <dbl> <int> <dbl> <dbl>
#1 10 (0,1] 1 0.2 30 1 25 0.00667
#2 10 (1,3] 1 0.2 20 1 25 0.01
#3 10 (3,5] 1 0.2 100 1 25 0.002
#4 10 (5,6] 1 0.2 10 1 25 0.02
#5 20 (0,1] 1 0.3 80 1 25 0.00375
#6 20 (1,3] 3 0.900 140 3 75 0.00643
#7 30 (1,3] 1 0.1 20 1 50 0.005
#8 30 (3,5] 1 0.1 40 1 50 0.0025
The cut step can also be changed with findInterval
NOTE: The idea of split/map is based on the OP's title about looping and getting the output

Resources