Short code to extract fitted value from tbl_df objects - r

I have a data set containing groups of data and I performed regression on on each group of data. I used dplyr to do the regression and get a tbl_df object with all results. Then I want to extract fitted value vector for each group of regression and put them in a data frame. I used to use summarise() to extract relevant information conveniently. But it only works for scalars. Here is some sample code with lapply I used to extract the information and I feel it kind of cumbersome:
library(dplyr)
library(reshape2)
df1 = data.frame(type1 = c(rep('a',5),rep('b',5)),
x = 1:10,
y = 11:20)
df1 %>%
group_by(type1) %>%
do(model = lm(y~x,.)) -> model1
names(model1$model) = model1$type1
lapply(model1$model,function(mod) mod$fit) %>%
melt

library(broom)
model1 %>% augment(model)
# A tibble: 10 x 10
# Groups: type1 [2]
type1 y x .fitted .se.fit .resid .hat .sigma .cooksd .std.resid
<fctr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a 11 1 11 2.482534e-16 3.567051e-19 0.6 3.925229e-16 2.322633e-06 0.001759785
2 a 12 2 12 1.755417e-16 3.026750e-16 0.3 2.977199e-16 2.730293e-01 1.128776594
3 a 13 3 13 1.433292e-16 -3.857170e-16 0.2 2.471607e-16 2.263176e-01 -1.345563357
4 a 14 4 14 1.755417e-16 -1.380180e-16 0.3 3.747906e-16 5.677113e-02 -0.514715401
5 a 15 5 15 2.482534e-16 2.207032e-16 0.6 3.052655e-16 8.891591e-01 1.088827560
6 b 16 6 16 1.709167e-15 -2.416065e-15 0.6 8.008132e-17 2.248024e+00 -1.731290167
7 b 17 7 17 1.208563e-15 2.359219e-15 0.3 1.824137e-15 3.499565e-01 1.277939838
8 b 18 8 18 9.867878e-16 1.265324e-15 0.2 2.510473e-15 5.138141e-02 0.641132787
9 b 19 9 19 1.208563e-15 5.595623e-17 0.3 2.702016e-15 1.968677e-04 0.030310330
10 b 20 10 20 1.709167e-15 -1.264434e-15 0.6 2.303179e-15 6.157097e-01 -0.906060815

Use the tidyverse package, which contains dplyr, purrr, tidyr
library(tidyverse)
Use nest and map
df1 %>%
group_by(type1) %>%
nest() %>%
mutate(data = map(data, ~lm(y~x,.x)$fit)) %>% # combined lm with $fit
unnest()
Output
type1 data
1 a 11
2 a 12
3 a 13
4 a 14
5 a 15
6 b 16
# etc

We can use modelr together with tidyverse. The add_predictions function is handy. Here is an example.
# Load package
library(tidyverse)
library(modelr)
# Create example data frame
df1 = data.frame(type1 = c(rep('a',5),rep('b',5)),
x = 1:10,
y = 11:20)
# Created nested data frame
df2 <- df1 %>%
group_by(type1) %>%
nest()
# A function to fit lm model to y ~ x
lm_model <- function(df) {
lm(y ~ x, data = df)
}
# Fit model
df3 <- df2 %>%
mutate(model = map(data, lm_model))
# Add prediction
df4 <- df3 %>%
mutate(Pred = map2(data, model, add_predictions))
# Unnest the data frame
df5 <- df4 %>% unnest(Pred)
df5
# A tibble: 10 x 4
type1 x y pred
<fctr> <int> <int> <dbl>
1 a 1 11 11
2 a 2 12 12
3 a 3 13 13
4 a 4 14 14
5 a 5 15 15
6 b 6 16 16
7 b 7 17 17
8 b 8 18 18
9 b 9 19 19
10 b 10 20 20

I'd like provide a slick answer using data.table package:
library(data.table)
df1 %>%
data.table %>%
.[,
.(x,
y,
fit = lm(y~x)$fit),
by = type1]

Related

Apply t test over all columns of data frame seperated by variable [duplicate]

This question already has an answer here:
dplyr summarize across ttest
(1 answer)
Closed 7 months ago.
I have data frame like this
X1 X2 X3 X4 X5 class
1 1 7 3 9 5 n
2 2 8 4 10 6 n
3 3 9 5 1 7 n
4 4 10 6 2 8 p
5 5 1 7 3 9 p
6 6 2 8 4 10 p
I like to run t test over all columns, separated on groups formed by the variable class.
I know I can use for loop for this, but I wonder if there is better code for that.
m1 <- data.frame(matrix(c <- (1:10), nrow = 6, ncol = 5))
m1 <- data.frame(m1,c("n","n","n","p","p","p"))
names(m1)[6] = "class"
# work for one column
t.test(X1~class, data= m1)
# What I'm looking for
# t.test(X_i~class, data= m1)
library(dplyr)
library(tidyr)
library(broom)
df <- data.frame(
x1 = 0:9,
x2 = 10:19,
x3 = 20:29,
class = rep(c("a", "b"), each = 5)
)
# Conduct tests and store in nested data.frame
nested <- df %>%
group_by(class) %>%
summarise(across(everything(), ~ list(
t.test(.) %>%
tidy() %>%
select(estimate, statistic,
p.value, conf.low, conf.high)
)))
# Unnest and turn into long structure
long <- nested %>%
unnest(cols = starts_with("x"), names_sep = "_") %>%
pivot_longer(cols = starts_with("x"),
names_to = "quantity",
values_to = "value")
# Split variables into multiple columns and widen
long %>%
separate(col = quantity, into = c("variable", "quantity"), sep = "_") %>%
pivot_wider(names_from = "quantity")
#> # A tibble: 6 x 7
#> class variable estimate statistic p.value conf.low conf.high
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 a x1 2 2.83 0.0474 0.0368 3.96
#> 2 a x2 12 17.0 0.0000707 10.0 14.0
#> 3 a x3 22 31.1 0.00000636 20.0 24.0
#> 4 b x1 7 9.90 0.000584 5.04 8.96
#> 5 b x2 17 24.0 0.0000178 15.0 19.0
#> 6 b x3 27 38.2 0.00000281 25.0 29.0

How to pass multiple functions with multiple arguments using purr::map?

I'm trying to evaluate multiple functions on my dataset using purr library. I tried spliting the data and then apply map with the functions list but I don't seem to find the right way to introduce my varaibles as arguments. What I am missing?
library(tidyverse)
df <- tibble(f1 = c('a','a','a','b','b','c','c','c','d','d'),
f2 = c('z','x','y','z','x','z','x','y','z','x'),
obs = sample(1:10, 10),
pred = sample(1:10,10))
#RMSE
#------------------------------------------------------------------------------
rmse <- function(obs, model){
n = length(obs)
diff = (model - obs)**2
rmse = sqrt(sum(diff)) / n
return(rmse)
}
#Mean Bias
#------------------------------------------------------------------------------
mean_bias <- function(obs, model){
n =length(obs)
diff=model-obs
mean_bias=sum(diff)/n
return(mean_bias)
}
#Mean error
#------------------------------------------------------------------------------
mean_error <- function(obs, model){
n =length(obs)
abs=abs(model-obs)
mean_error=sum(abs)/n
return(mean_error)
}
mod_eval <- list(rmse, mean_bias,mean_error)
test <- df%>%
split(list(c(.$f1,.$f2)))%>%
invoke_map(.f=mod_eval)##????
I'll expect to get something like this
f1 f2 rmse mean_bias mean_error
1 a x 17 11 5
2 b x 17 11 5
3 c x 17 11 5
4 d x 17 11 5
5 a y 17 11 5
6 b y 17 11 5
You can pass a named list of functions and use map to apply each function to each combination.
library(dplyr)
library(purrr)
mod_eval <- lst(rmse, mean_bias,mean_error)
df %>%
split(.$f1) %>%
map_df(function(x) map_dbl(mod_eval, ~.x(x$obs, x$pred)), .id = 'f1')
# f1 rmse mean_bias mean_error
# <chr> <dbl> <dbl> <dbl>
#1 a 2.33 -1.67 3.67
#2 b 3.91 5.5 5.5
#3 c 2.92 -2.33 4.33
#4 d 2.5 0.5 3.5
We could use tidyverse approaches
library(dplyr)
library(tidyr)
mod_eval <- dplyr::lst(rmse, mean_bias, mean_error)
df %>%
nest_by(f1) %>%
crossing(mod_eval) %>%
mutate(nm1 = names(mod_eval)) %>%
rowwise %>%
transmute(f1, nm1, out = list(mod_eval(data$obs, data$pred))) %>%
unnest(out) %>%
pivot_wider(names_from = nm1, values_from = out)
-output
f1 rmse mean_bias mean_error
<chr> <dbl> <dbl> <dbl>
1 a 1.80 0.333 3
2 b 3.35 -1.5 4.5
3 c 3.09 4 4.67
4 d 4.53 -5 5
if it is grouped by 'f1', 'f2'
df %>%
nest_by(f1, f2) %>%
crossing(mod_eval) %>%
mutate(nm1 = names(mod_eval)) %>%
rowwise %>%
transmute(f1, f2, nm1, out = list(mod_eval(data$obs, data$pred))) %>%
unnest(out) %>%
pivot_wider(names_from = nm1, values_from = out)
-output
# A tibble: 10 x 5
f1 f2 rmse mean_bias mean_error
<chr> <chr> <dbl> <dbl> <dbl>
1 a x 4 -4 4
2 a y 2 2 2
3 a z 3 3 3
4 b x 3 3 3
5 b z 6 -6 6
6 c x 1 -1 1
7 c y 7 7 7
8 c z 6 6 6
9 d x 9 -9 9
10 d z 1 -1 1

How to find max values in each group in R using dplyr

How could I generate a new summary data.frame with only the largest values of a data.frame
I tried the code below, but the result was not what I expected.
n<-c('A','B','C','A','B','C','A','B','C')
x1<-c(1,2,3,4,5,6,7,8,9)
x2<-c(9,8,7,4,5,6,1,2,3)
df<-data.frame(n, x1, x2)
df%>%
group_by(n)%>%
mutate('x1'=max('x1'), 'x2'=max('x2'))
i wanted this
n X1 X2
A 7 9
B 8 8
C 9 7
Does this work:
library(dplyr)
df %>% group_by(n ) %>% summarise(across(x1:x2, max))
# A tibble: 3 x 3
n x1 x2
<chr> <dbl> <dbl>
1 A 7 9
2 B 8 8
3 C 9 7
We can use across with group_by
library(dplyr)
df %>%
group_by(n) %>%
summarise(across(everything(), max, na.rm = TRUE))
-output
# A tibble: 3 x 3
n x1 x2
<chr> <dbl> <dbl>
1 A 7 9
2 B 8 8
3 C 9 7
Or use aggregate in base R
aggregate(.~ n, df, max)
Just change it to x1 = max(x1) without the quotes and change mutate into summarise.
df%>%
group_by(n)%>%
summarise(x1=max(x1), x2=max(x2))
A data.table option
library(data.table)
setDT(df)[, Map(max, .SD), n]
gives
n x1 x2
1: A 7 9
2: B 8 8
3: C 9 7

mutate or summarise across rows by variable containing string

I'd like to create a new data table which is the sum across rows from variables which contain a string. I have been trying to keep this within the tidyverse as a noob using new dplyr across. Help much appreciated.
dat<- data.frame("Image" = c(1,2,3,4),
"A" = c(1,2,3,4),
"A:B"= c(5,6,7,8),
"A:B:C"= c(9,10,11,12))
to obtain the sums across the rows of variables containing "A", "B", or "C".
datsums<- data.frame("Image" = c(1,2,3,4),
"Asum"= c(15,18,21,24),
"Bsum"=c(14,16,18,20),
"Csum"=c(9,10,11,12))
I have been unsuccessful using the newer dplyr verbs:
datsums<- dat %>% summarise(across(str_detect("A")), sum, .names ="Asum",
across(str_detect("B")), sum, .names="Bsum",
across(str_detect("C")), sum, .names"Csum")
use rowwise and c_across:
library(tidyverse)
dat %>%
rowwise() %>%
summarise(
Asum = sum(c_across(contains("A"))),
Bsum = sum(c_across(contains("B"))),
Csum = sum(c_across(contains("C")))
)
Returns:
`summarise()` ungrouping output (override with `.groups` argument)
# A tibble: 4 x 3
Asum Bsum Csum
<dbl> <dbl> <dbl>
1 16 14 9
2 20 16 10
3 24 18 11
4 28 20 12
To add columns to the original data.frame, use mutate instead of summarise:
dat %>%
rowwise() %>%
mutate(
Asum = sum(c_across(contains("A"))),
Bsum = sum(c_across(contains("B"))),
Csum = sum(c_across(contains("C")))
)
# A tibble: 4 x 7
# Rowwise:
Image A A.B A.B.C Asum Bsum Csum
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 5 9 16 14 9
2 2 2 6 10 20 16 10
3 3 3 7 11 24 18 11
4 4 4 8 12 28 20 12
Since you want row-wise sum you could use :
library(dplyr)
dat %>%
transmute(Asum = rowSums(select(., contains('A', ignore.case = FALSE))),
Bsum = rowSums(select(., contains('B', ignore.case = FALSE))),
Csum = rowSums(select(., contains('C', ignore.case = FALSE))))
Or for many variables use :
cols <- c('A', 'B', 'C')
purrr::map_dfc(cols, ~dat %>%
transmute(!!paste0(.x, 'sum') :=
rowSums(select(., contains(.x, ignore.case = FALSE)))))
# Asum Bsum Csum
#1 15 14 9
#2 18 16 10
#3 21 18 11
#4 24 20 12
use pivot_longer and pivot_wider
library(tidyverse)
dat %>%
pivot_longer(-Image) %>%
separate_rows(name, sep = "\\.") %>%
pivot_wider(Image,
names_from = name,
values_from = value,
values_fn = sum,
names_prefix = "sum")
#> # A tibble: 4 x 4
#> Image sumA sumB sumC
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 15 14 9
#> 2 2 18 16 10
#> 3 3 21 18 11
#> 4 4 24 20 12
Created on 2020-12-07 by the reprex package (v0.3.0)

Compute percentage of rows in group that have a certain value in another column

I am using the dataset birthwt.
For each age, I want to find the percentage of mothers that are white. My end goal is to display that percentage in a plot by age. How can I do this? I'm learning how to use tidyverse functions so I would prefer to do it that way if possible. Here is my work so far:
library(tidyverse)
library(tidyselect)
library("MASS")
grouped <- birthwt %>%
count(race, age) %>%
spread(key = race, value = n, fill = 0)
grouped
This gets a table where each row represents an age, and there is a column for each race representing the count of mothers of that age. This approach may or may not be on the right path.
We can group by 'age' and get the mean of logical vector
library(dplyr)
birthwt %>%
group_by(age) %>%
summarise(perc = mean(race == 1))
# A tibble: 24 x 2
# age perc
# <int> <dbl>
# 1 14 0.333
# 2 15 0.333
# 3 16 0.286
# 4 17 0.25
# 5 18 0.6
# 6 19 0.625
# 7 20 0.333
# 8 21 0.417
# 9 22 0.769
#10 23 0.308
# … with 14 more rows
Or an option with data.table
library(data.table)
setDT(birthwt)[, .(perc = mean(race == 1)), age]
Or using base R
birthwt$perc <- with(birthwt, ave(race == 1, age))
Or another base R option is
with(birthwt, tapply(race == 1, age, FUN = mean))
Or with aggregate
aggregate(cbind(perc = race == 1) ~ age, birthwt, FUN = mean)
Or with by
by(birthwt$race == 1, birthwt$age, FUN = mean)
We can count the number of race which are white for each age and divide it by total number of rows for each age to get ratio.
library(dplyr)
birthwt %>%
group_by(age) %>%
summarise(perc = sum(race == 1)/n())
# A tibble: 24 x 2
# age perc
# <int> <dbl>
# 1 14 0.333
# 2 15 0.333
# 3 16 0.286
# 4 17 0.25
# 5 18 0.6
# 6 19 0.625
# 7 20 0.333
# 8 21 0.417
# 9 22 0.769
#10 23 0.308
# … with 14 more rows
In base R, we can use aggregate following the same logic
aggregate(race~age, birthwt,function(x) sum(x == 1)/length(x))
Or something similar to your approach using table, we could do
tab <- table(birthwt$age, birthwt$race)
tab[, "1"]/rowSums(tab)

Resources