I am stuck on performing pivot_longer() over multiple sets of columns. Here is the sample dataset:
df <- data.frame(
id = c(1, 2),
uid = c("m1", "m2"),
germ_kg = c(23, 24),
mineral_kg = c(12, 17),
perc_germ = c(45, 34),
perc_mineral = c(78, 10))
I need the output dataframe to look like this:
out <- data.frame(
id = c(1, 1, 2, 2),
uid = c("m1", "m1", "m2", "m2"),
crop = c("germ", "mineral", "germ", "mineral"),
kg = c(23, 12, 24, 17),
perc = c(45, 78, 34, 10))
library(tidyverse)

df %>%
rename_with(~str_replace(.x, '(.*)_kg', 'kg_\\1')) %>%  # rename *_kg to kg_* so every name is '<measure>_<crop>'
pivot_longer(-c(id, uid), names_to = c('.value', 'crop'), names_sep = '_')
# A tibble: 4 x 5
id uid crop kg perc
<dbl> <chr> <chr> <dbl> <dbl>
1 1 m1 germ 23 45
2 1 m1 mineral 12 78
3 2 m2 germ 24 34
4 2 m2 mineral 17 10
If you were to use data.table:
library(data.table)
melt(setDT(df), c('id', 'uid'), patterns(kg = 'kg', perc = 'perc'))
id uid variable kg perc
1: 1 m1 1 23 45
2: 2 m2 1 24 34
3: 1 m1 2 12 78
4: 2 m2 2 17 10
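If you want the crop names rather than the 1/2 index in the variable column, one option (a sketch, assuming the column order of the sample df above) is to relabel the factor after melting and rename the column:
library(data.table)
out <- melt(setDT(df), c("id", "uid"),
measure.vars = patterns(kg = "kg", perc = "perc"))
# `variable` is a factor of match positions (1 = germ_kg/perc_germ,
# 2 = mineral_kg/perc_mineral), so relabel it with the crop names
levels(out$variable) <- c("germ", "mineral")
setnames(out, "variable", "crop")
out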
I suspect there might be a simpler way using pivot_longer_spec(), but one tricky thing here is that your column names don't have a consistent ordering of their semantic components. @Onyambu's answer deals with this nicely by fixing it upstream; a sketch of the spec-based route is shown after the output below.
library(tidyverse)
df %>%
pivot_longer(-c(id, uid)) %>%
separate(name, c("col1", "col2")) %>% # only needed
mutate(crop = if_else(col2 == "kg", col1, col2), # because name
meas = if_else(col2 == "kg", col2, col1)) %>% # structure
select(id, uid, crop, meas, value) %>% # is
pivot_wider(names_from = meas, values_from = value) # inconsistent
# A tibble: 4 x 5
id uid crop kg perc
<dbl> <chr> <chr> <dbl> <dbl>
1 1 m1 germ 23 45
2 1 m1 mineral 12 78
3 2 m2 germ 24 34
4 2 m2 mineral 17 10
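For completeness, here is roughly what the spec-based route might look like, with the spec written by hand so the inconsistent name ordering is handled explicitly (a sketch, assuming tidyr >= 1.0):
library(tidyverse)

spec <- tribble(
~.name,         ~.value, ~crop,
"germ_kg",      "kg",    "germ",
"mineral_kg",   "kg",    "mineral",
"perc_germ",    "perc",  "germ",
"perc_mineral", "perc",  "mineral"
)

df %>% pivot_longer_spec(spec)
Each row of the spec maps one original column name to the output measure column (.value) and the crop it belongs to, so the ordering of the name parts no longer matters.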
I'm wondering if there is a better way to get linear regression coefficients as columns in dplyr. Here is some sample data.
mydata <-
data.frame(
Site = c(1,1,1,1,1,1,1,1),
Site1 = c(2,3,2,3,2,3,2,3),
Age = c(17, 52, 19, 18, 62, 53, 41, 24),
Gender = c(1,2,1,1,2,2,2,1),
Outcome = c(1,1,1,1,0,0,0,1)
)
I wrote this helper function to turn summary(.data)$coefficients into columns
GetCoefficients <- function(.data){
AllData <- data.frame()
AllData[1, ] <- ""
x <- summary(.data)$coefficients
col_names <- colnames(x)
row_names <- rownames(x)
row_len <- length(row_names)
for (i in 1:length(x)){
# x[i] walks the coefficient matrix column by column, so the term (row) and
# statistic (column) of each entry are recovered from the linear index i
AllData <- AllData %>%
mutate(!!paste0(row_names[ifelse(i %% row_len != 0, i %% row_len, row_len)],
"_", col_names[ceiling(i / row_len)]) := x[i])
}
return(AllData)
}
Using the helper function I can put coefficients into my data.frame()
Linear_regression <- mydata %>%
pivot_longer(starts_with("Site"),
names_to = ".value",
names_pattern = "(^Site)") %>%
group_by(Site) %>%
do(Reg = lm(Outcome ~ Age + Gender, data = .)) %>%
mutate(rsq = summary(Reg)$r.squared) %>%
mutate(fun = GetCoefficients(Reg))
Here is a combination of the tidyverse and the broom package to get your desired output.
group_split() is very handy here: it gives you a list of data frames, which you then iterate over with purrr's map_dfr() (map_dfr() returns a data frame, whereas map() returns a list), fitting lm(...) on each list element. broom's glance() then gives the desired one-row summary per model:
library(tidyverse)
library(broom)
mydata %>%
pivot_longer(starts_with("Site"),
names_to = ".value",
names_pattern = "(^Site)") %>%
mutate(Site=as.factor(Site)) %>%
group_by(Site) %>%
group_split() %>%
map_dfr(.f = function(df){
lm(Outcome ~ Age+Gender, data=df) %>%
glance() %>%
add_column(Site = unique(df$Site), .before = 1)
})
Site r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
1 1 0.6 0.44 3.87e- 1 3.75e+ 0 1.01e- 1 2 -1.88 11.8 12.1 7.5 e- 1 5 8
2 2 1 1 2.22e-16 1.01e+31 2.22e-16 2 141. -275. -277. 4.93e-32 1 4
3 3 0.351 -0.946 6.97e- 1 2.71e- 1 8.05e- 1 2 -1.46 10.9 8.47 4.86e- 1 1 4
The function broom::tidy() extracts the coefficient estimates, as well as their standard errors, t-statistics, and p-values, into rows.
library("tidyverse")
mydata <- tibble::tribble(
~Age, ~Gender, ~Outcome, ~Site,
17, 1, 1, "1",
17, 1, 1, "2",
52, 2, 1, "1",
52, 2, 1, "3",
19, 1, 1, "1",
19, 1, 1, "2",
18, 1, 1, "1",
18, 1, 1, "3",
62, 2, 0, "1",
62, 2, 0, "2",
53, 2, 0, "1",
53, 2, 0, "3",
41, 2, 0, "1",
41, 2, 0, "2",
24, 1, 1, "1",
24, 1, 1, "3"
)
mydata %>%
group_by(
Site
) %>%
group_modify(
~ broom::tidy(lm(Outcome ~ Age + Gender, data = .))
)
#> Warning in summary.lm(x): essentially perfect fit: summary may be unreliable
#> # A tibble: 9 × 6
#> # Groups: Site [3]
#> Site term estimate std.error statistic p.value
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 1 (Intercept) 1.75e+ 0 5.37e- 1 3.26e+ 0 2.25e- 2
#> 2 1 Age -9.13e-18 2.44e- 2 -3.74e-16 1 e+ 0
#> 3 1 Gender -7.50e- 1 8.40e- 1 -8.92e- 1 4.13e- 1
#> 4 2 (Intercept) 2 e+ 0 4.20e-16 4.76e+15 1.34e-16
#> 5 2 Age 9.08e-18 1.49e-17 6.10e- 1 6.51e- 1
#> 6 2 Gender -1 e+ 0 5.46e-16 -1.83e+15 3.48e-16
#> 7 3 (Intercept) 1.22e+ 0 2.03e+ 0 6.00e- 1 6.56e- 1
#> 8 3 Age -2.70e- 2 1.62e- 1 -1.67e- 1 8.95e- 1
#> 9 3 Gender 3.51e- 1 5.16e+ 0 6.82e- 2 9.57e- 1
Sometimes, however, we would like to extract only the coefficient estimates into columns.
mydata %>%
group_by(
Site
) %>%
group_modify(
~ bind_rows(coefficients(lm(Outcome ~ Age + Gender, data = .)))
)
#> # A tibble: 3 × 4
#> # Groups: Site [3]
#> Site `(Intercept)` Age Gender
#> <chr> <dbl> <dbl> <dbl>
#> 1 1 1.75 -9.13e-18 -0.750
#> 2 2 2 9.08e-18 -1
#> 3 3 1.22 -2.70e- 2 0.351
Created on 2022-04-20 by the reprex package (v2.0.1)
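If you prefer to stay with broom::tidy() and still want the terms as columns, a sketch using the same mydata (and the tidyverse already loaded above) is to reshape the tidy output with pivot_wider():
mydata %>%
group_by(Site) %>%
group_modify(
~ broom::tidy(lm(Outcome ~ Age + Gender, data = .))
) %>%
select(Site, term, estimate) %>%
pivot_wider(names_from = term, values_from = estimate)
Selecting std.error as well and passing values_from = c(estimate, std.error) would give one column per term and statistic.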
I'm taking the mean of every 3 rows, by group, using the summarise function. I would also like to select the last date from the three rows that make up each average.
I tried selecting the maximum, but that way I just select the latest date for the whole group.
test = data.frame(my_groups = c("A", "A", "A", "B", "B", "C", "C", "C", "A", "A", "A"),
measure = c(10, 20, 5, 2, 62 ,2, 5, 4, 6, 7, 25),
time= c("20-09-2020", "25-09-2020", "19-09-2020", "20-05-2020", "20-06-2021",
"11-01-2021", "13-01-2021", "13-01-2021", "15-01-2021", "15-01-2021", "19-01-2021"))
# > test
# my_groups measure time
# 1 A 10 20-09-2020
# 2 A 20 25-09-2020
# 3 A 5 19-09-2020
# 4 B 2 20-05-2020
# 5 B 62 20-06-2021
# 6 C 2 11-01-2021
# 7 C 5 13-01-2021
# 8 C 4 13-01-2021
# 9 A 6 15-01-2021
# 10 A 7 15-01-2021
# 11 A 25 19-01-2021
library(dplyr)
library(zoo)  # rollapply()

test %>%
arrange(time) %>%
group_by(my_groups) %>%
summarise(mean_3 = rollapply(measure, 3, mean, by = 3, align = "left", partial = F),
final_data = max(time))
# my_groups mean_3 final_data
# <chr> <dbl> <chr>
# 1 A 12.7 25-09-2020
# 2 A 11.7 25-09-2020
# 3 C 3.67 13-01-2021
For the window with mean 12.7 I wish the date were 19-01-2021, and not the global maximum of group A (25-09-2020).
Any hint on how I could do that?
I have two dplyr ways for you. I'm not happy with them, because when rollapply() with max on the dates doesn't find a full window in group B, it fills with a double by default, which doesn't match the character dates from groups A and C.
Mutate:
test %>%
arrange(time) %>%
group_by(my_groups) %>%
mutate(final = rollapply(time, 3, max, by = 3, fill = NA, align = "left", partial = F),
mean_3 = rollapply(measure, 3, mean, by = 3, fill = NA, align = "left", partial = F)) %>%
filter(!is.na(final)) %>%
select(my_groups, final, mean_3) %>%
arrange(my_groups)
# A tibble: 3 x 3
# Groups: my_groups [2]
my_groups final mean_3
<chr> <chr> <dbl>
1 A 19-01-2021 12.7
2 A 25-09-2020 11.7
3 C 13-01-2021 3.67
A summarise() that doesn't really summarise to one row per group, but the code is a bit cleaner:
test %>%
arrange(time) %>%
group_by(my_groups) %>%
summarise(final = rollapply(time, 3, max, by = 3, fill = NA, align = "left", partial = F),
mean_3 = rollapply(measure, 3, mean, by = 3, fill = NA, align = "left", partial = F)) %>%
filter(!is.na(final))
`summarise()` has grouped output by 'my_groups'. You can override using the `.groups` argument.
# A tibble: 3 x 3
# Groups: my_groups [2]
my_groups final mean_3
<chr> <chr> <dbl>
1 A 19-01-2021 12.7
2 A 25-09-2020 11.7
3 C 13-01-2021 3.67
Edit:
Added isa's solution from the comments. partial = TRUE does the trick:
test %>%
arrange(time) %>%
group_by(my_groups) %>%
summarise(mean_3 = rollapply(measure, 3, mean, by = 3, align = "left", partial = F),
final_data = rollapply(time, 3, max, by = 3, align = "left", partial = T))
`summarise()` has grouped output by 'my_groups'. You can override using the `.groups` argument.
# A tibble: 3 x 3
# Groups: my_groups [2]
my_groups mean_3 final_data
<chr> <dbl> <chr>
1 A 12.7 19-01-2021
2 A 11.7 25-09-2020
3 C 3.67 13-01-2021
Another possible solution:
library(tidyverse)
test = data.frame(my_groups = c("A", "A", "A", "B", "B", "C", "C", "C", "A", "A", "A"),
measure = c(10, 20, 5, 2, 62 ,2, 5, 4, 6, 7, 25),
time= c("20-09-2020", "25-09-2020", "19-09-2020", "20-05-2020", "20-06-2021",
"11-01-2021", "13-01-2021", "13-01-2021", "15-01-2021", "15-01-2021", "19-01-2021"))
test %>%
group_by(data.table::rleid(my_groups)) %>%
filter(n() == 3) %>%
summarise(
groups = unique(my_groups),
mean_3 = mean(measure), final_data = max(time), .groups = "drop") %>%
select(-1)
#> # A tibble: 3 × 3
#> groups mean_3 final_data
#> <chr> <dbl> <chr>
#> 1 A 11.7 25-09-2020
#> 2 C 3.67 13-01-2021
#> 3 A 12.7 19-01-2021
EDIT
To allow for calculating the mean of 2 values, as asked for by the OP in a comment below, I revised my code to use data.table::frollmean and data.table::frollapply:
library(tidyverse)
library(lubridate)
library(data.table)
n <- 2 # choose the number with which to calculate the mean
test %>%
group_by(rleid(my_groups)) %>%
summarise(
groups = unique(my_groups),
mean_n = frollmean(measure, n), final_data = frollapply(dmy(time), n, max) %>%
as_date(origin = lubridate::origin), .groups = "drop") %>%
drop_na(mean_n) %>% select(-1)
#> # A tibble: 7 × 3
#> groups mean_n final_data
#> <chr> <dbl> <date>
#> 1 A 15 2020-09-25
#> 2 A 12.5 2020-09-25
#> 3 B 32 2021-06-20
#> 4 C 3.5 2021-01-13
#> 5 C 4.5 2021-01-13
#> 6 A 6.5 2021-01-15
#> 7 A 16 2021-01-19
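Note that time is stored as character, so arrange() and max() compare the "dd-mm-yyyy" strings lexicographically and only give sensible answers here because the sample windows happen to fall within the same month. A sketch that parses the dates first and summarises non-overlapping windows of 3 directly, with no zoo dependency (the window helper column is introduced just for illustration):
library(dplyr)
library(lubridate)  # dmy()

test %>%
mutate(time = dmy(time)) %>%                     # parse "dd-mm-yyyy" into real dates
arrange(time) %>%
group_by(my_groups) %>%
mutate(window = (row_number() - 1) %/% 3) %>%    # 0,0,0,1,1,1,... within each group
group_by(my_groups, window) %>%
filter(n() == 3) %>%                             # keep complete windows of 3 only
summarise(mean_3 = mean(measure),
final_data = max(time),
.groups = "drop") %>%
select(-window)
For this sample the windows end up the same as above, but they are now formed in true chronological order and final_data is a real Date.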
The title is complicated, but I don't know how to put this problem into words, so I'll demonstrate.
Here's my problem, with the desired output:
library(tibble)
# Input:
tribble(
~n_1, ~n_2, ~n_3, ~pct_1, ~pct_2, ~pct_3,
10, 20, 30, 0.1, 0.2, 0.3
)
#> # A tibble: 1 x 6
#> n_1 n_2 n_3 pct_1 pct_2 pct_3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 10 20 30 0.1 0.2 0.3
# Desired output:
tribble(
~name, ~n, ~pct,
1, 10, 0.1,
2, 20, 0.2,
3, 30, 0.3
)
#> # A tibble: 3 x 3
#> name n pct
#> <dbl> <dbl> <dbl>
#> 1 1 10 0.1
#> 2 2 20 0.2
#> 3 3 30 0.3
I tried tidyr::pivot_longer(), but I can't get it right. Is there any way?
One option could be:
df %>%
pivot_longer(everything(),
names_to = c(".value", "name"),
names_pattern = "(.*)_(.)")
name n pct
<chr> <dbl> <dbl>
1 1 10 0.1
2 2 20 0.2
3 3 30 0.3
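The single-character capture group (.) in names_pattern would miss longer suffixes such as _10; a variant (a sketch, assuming each name contains exactly one underscore) uses names_sep instead:
df %>%
pivot_longer(everything(),
names_to = c(".value", "name"),
names_sep = "_")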
Try this approach. Since your variable names are concatenated, you can use separate() (with sep = '_') after pivot_longer(), and then pivot_wider() to obtain the expected dataframe. Here is the code:
library(tidyverse)
#Code
df %>% pivot_longer(cols = everything()) %>%
separate(name,into = c('var','name'),sep = '_') %>%
pivot_wider(names_from = var,values_from=value)
Output:
# A tibble: 3 x 3
name n pct
<chr> <dbl> <dbl>
1 1 10 0.1
2 2 20 0.2
3 3 30 0.3
Some data used (the one you provided):
#Data
df <- structure(list(n_1 = 10, n_2 = 20, n_3 = 30, pct_1 = 0.1, pct_2 = 0.2,
pct_3 = 0.3), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))
My data frame looks like this:
id A T C G ref var
1 1 10 15 7 0 A C
2 2 11 9 2 3 A G
3 3 2 31 1 12 T C
I'd like to create two new columns, ref_count and var_count, which will have the following values:
Value from A column and value from C column, since ref is A and var is C
Value from A column and value from G column, since ref is A and var is G
etc.
So I'd like to select a column based on the value in another column for each row.
Thanks!
We can use pivot_longer to reshape into 'long' format, filter the rows, and then reshape back to 'wide' format with pivot_wider.
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(cols = A:G) %>%
group_by(id) %>%
filter(name == ref|name == var) %>%
mutate(nm1 = c('ref_count', 'var_count')) %>%
ungroup %>%
select(id, value, nm1) %>%
pivot_wider(names_from = nm1, values_from = value) %>%
left_join(df1, .)
# A tibble: 3 x 9
# id A T C G ref var ref_count var_count
#* <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#1 1 10 15 7 0 A C 10 7
#2 2 11 9 2 3 A G 11 3
#3 3 2 31 1 12 T C 31 1
Or in base R, we can also make use of the vectorized row/column indexing
df1$refcount <- as.matrix(df1[2:5])[cbind(seq_len(nrow(df1)), match(df1$ref, names(df1)[2:5]))]
df1$var_count <- as.matrix(df1[2:5])[cbind(seq_len(nrow(df1)), match(df1$var, names(df1)[2:5]))]
data
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))
The following is a tidyverse alternative without creating a long dataframe that needs filtering. It essentially uses tidyr::nest() to nest the dataframe by rows, after which the correct column can be selected for each row.
df1 %>%
nest(data = -id) %>%
mutate(
data = map(
data,
~mutate(., refcount = .[[ref]], var_count = .[[var]])
)
) %>%
unnest(data)
#> # A tibble: 3 × 9
#> id A T C G ref var refcount var_count
#> <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#> 1 1 10 15 7 0 A C 10 7
#> 2 2 11 9 2 3 A G 11 3
#> 3 3 2 31 1 12 T C 31 1
A variant of this does not need the (assumed row-specific) id column but defines the nested groups from the unique values of ref and var directly:
df1 %>%
nest(data = -c(ref, var)) %>%
mutate(
data = pmap(
list(data, ref, var),
function(df, ref, var) {
mutate(df, refcount = df[[ref]], var_count = df[[var]])
}
)
) %>%
unnest(data)
The data were specified by akrun:
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))