Data Frame to matrix - many rows - r

I'm trying to convert a data.frame to a matrix. I calculated some statistics for the iris dataset and want each statistic to be placed in a separate row. The code below puts all statistics (avg and median) in one single row, which is not the desired output. I want something like this:
stat Sepal.Length Sepal.Width ....
avg 10.5 .....
med ...... .....
Code below:
# Question code: reproduces the unwanted single-row result.
data_iris <- iris
# One-row summary holding the mean of each measurement column.
avg <- data_iris %>%
summarise_at(vars(Sepal.Length:Petal.Width),mean,na.rm=TRUE)
# One-row summary holding the median of each measurement column.
med <- data_iris %>%
summarise_at(vars(Sepal.Length:Petal.Width),median,na.rm=TRUE)
# Names of the first four columns and the intended row labels
# (neither is used further in this snippet).
column <- colnames(data_iris[1:4])
rown <- c("avg","median")
# The two one-row summaries are combined column-wise, so the result
# still has a single row.
df <- data.frame(avg=avg,med=med)
m <- data.matrix(df)
An additional question: I'd like to calculate quantiles, but an error comes up:
# quantile() returns several values per column (five here, per the error
# below), which this summarise_at() call does not accept.
qrtl <- data_iris %>%
summarise_at(vars(Sepal.Length:Petal.Width),quantile,na.rm=TRUE)
error: Column Sepal.Length must be length 1 (a summary value), not 5
What's wrong?

It can be done if we do a reshape into 'long' with pivot_longer
library(dplyr)
library(tidyr)
# Compute both statistics for every numeric column (result names look like
# Sepal.Length_avg), then split each name at "_" so the statistic becomes a
# row label and the variables become columns again.
iris %>%
  summarise_if(is.numeric, list(avg = mean, med = median)) %>%
  pivot_longer(
    cols = everything(),
    names_to = c(".value", "stat"),
    names_sep = "_"
  )
# stat Sepal.Length Sepal.Width Petal.Length Petal.Width
#1 avg 5.843333 3.057333 3.758 1.199333
#2 med 5.800000 3.000000 4.350 1.300000
If it needs to be converted to matrix, then change the 'stat' to rownames and then use data.matrix
library(tibble)
# Reshape to one row per statistic, move the 'stat' labels into the row
# names, and convert the remaining columns to a matrix.
iris %>%
  summarise_if(is.numeric, list(avg = mean, med = median)) %>%
  pivot_longer(
    cols = everything(),
    names_to = c(".value", "stat"),
    names_sep = "_"
  ) %>%
  column_to_rownames(var = "stat") %>%
  data.matrix()
The quantile call works fine in the dev version of dplyr (0.8.99.9000):
# Apply quantile() to each measurement column; the output below shows one
# row per returned quantile.
iris %>%
  summarise_at(
    .vars = vars(Sepal.Length:Petal.Width),
    .funs = quantile,
    na.rm = TRUE
  )
# Sepal.Length Sepal.Width Petal.Length Petal.Width
#1 4.3 2.0 1.00 0.1
#2 5.1 2.8 1.60 0.3
#3 5.8 3.0 4.35 1.3
#4 6.4 3.3 5.10 1.8
#5 7.9 4.4 6.90 2.5
The OP's package version is 0.8.3, so maybe wrapping the result in a list would work:
# Wrap each column's quantiles in a list so every summary has length one,
# then expand all list-columns back into rows.
iris %>%
  summarise_at(
    vars(Sepal.Length:Petal.Width),
    list(quantile = ~ list(quantile(., na.rm = TRUE)))
  ) %>%
  unnest(c(names(.)))

We can use map with transpose and then bind rows from different statistics together.
library(purrr)
# Per column, build a named list of statistics; transpose() flips the nesting
# to statistic -> column, and bind_rows() stacks one row per statistic.
data_iris[1:4] %>%
  map(~ list(mean = mean(.x), sd = sd(.x))) %>%
  transpose() %>%
  dplyr::bind_rows(.id = "statistics")
# A tibble: 2 x 5
# statistics Sepal.Length Sepal.Width Petal.Length Petal.Width
# <chr> <dbl> <dbl> <dbl> <dbl>
#1 mean 5.84 3.06 3.76 1.20
#2 sd 0.828 0.436 1.77 0.762
Or
map_df(data_iris[1:4], ~c(mean = mean(.x), sd = sd(.x)))

Related

select n-th largest row per group

I am trying to select the n-th largest row per group in a dataset. For example, look at the iris dataset - I found this code on the internet that does this for the second largest value of Sepal.Length for each flower species:
library(dplyr)
# Return the second-largest distinct value of x
# (NA when x has fewer than two distinct values).
myfun <- function(x) {
  distinct_desc <- sort(unique(x), decreasing = TRUE)
  distinct_desc[2L]
}
# Second-largest distinct Sepal.Length within each species.
iris %>%
  group_by(Species) %>%
  summarise(result = myfun(Sepal.Length))
I am just trying to clarify whether I have understood this correctly. If I want the 3rd largest, do I just make a change like this? And how can I select all rows from the original data?
library(dplyr)
# Return the third-largest distinct value of x
# (NA when x has fewer than three distinct values).
myfun <- function(x) {
  distinct_desc <- sort(unique(x), decreasing = TRUE)
  distinct_desc[3L]
}
# Third-largest distinct Sepal.Length within each species.
iris %>%
  group_by(Species) %>%
  summarise(result = myfun(Sepal.Length))
Just modify the function to have an extra argument n to make it dynamic
# Return the n-th largest distinct value of x.
#   x: vector of values to rank.
#   n: position(s) to pick from the distinct values sorted in decreasing order.
# Positions beyond the number of distinct values yield NA.
myfun <- function(x, n) {
  distinct_desc <- sort(unique(x), decreasing = TRUE)
  distinct_desc[n]
}
and then call as
library(dplyr)
# Third-largest distinct Sepal.Length per species via the parameterised helper.
iris %>%
  group_by(Species) %>%
  summarise(result = myfun(Sepal.Length, n = 3))
-output
# A tibble: 3 × 2
Species result
<fct> <dbl>
1 setosa 5.5
2 versicolor 6.8
3 virginica 7.6
To get all the numeric columns, loop across the numeric columns
# Third-largest distinct value of every numeric column, per species.
iris %>%
  group_by(Species) %>%
  summarise(across(.cols = where(is.numeric), .fns = ~ myfun(.x, 3)))
# Alternative using nth():
#   summarise(across(
#     where(is.numeric),
#     ~ nth(unique(.x), order_by = -unique(.x), 3)
#   ))
-output
# A tibble: 3 × 5
Species Sepal.Length Sepal.Width Petal.Length Petal.Width
<fct> <dbl> <dbl> <dbl> <dbl>
1 setosa 5.5 4.1 1.6 0.4
2 versicolor 6.8 3.2 4.9 1.6
3 virginica 7.6 3.4 6.6 2.3
We could use nth from dplyr package after grouping and arrange:
library(dplyr)
# NOTE: rows are ordered by Sepal.Length only, so for the other columns
# nth(unique(.x), 3) is the third distinct value in that row order, not
# necessarily their own third-largest value.
iris %>%
  group_by(Species) %>%
  arrange(desc(Sepal.Length), .by_group = TRUE) %>%
  # Name the columns explicitly: across() without .cols is deprecated.
  summarise(across(everything(), ~ nth(unique(.x), 3)))
Species Sepal.Length Sepal.Width Petal.Length Petal.Width
<fct> <dbl> <dbl> <dbl> <dbl>
1 setosa 5.5 3.8 1.7 0.3
2 versicolor 6.8 2.8 4.8 1.7
3 virginica 7.6 2.8 6.9 2.3

Group t test result into columns within tidyverse

I'd like to group multiple t test result into one table. Originally my code looks like this:
# Paired t-test of Sepal.Length vs Petal.Length per species; the test is run
# twice per group here, once for each extracted component.
tt_data <- iris %>%
  group_by(Species) %>%
  summarise(
    p = t.test(
      Sepal.Length, Petal.Length,
      alternative = "two.sided", paired = TRUE
    )$p.value,
    estimate = t.test(
      Sepal.Length, Petal.Length,
      alternative = "two.sided", paired = TRUE
    )$estimate
  )
tt_data
# Species p estimate
# setosa 2.542887e-51 3.544
# versicolor 9.667914e-36 1.676
# virginica 7.985259e-28 1.036
However, based on the idea that I should only perform the statistical test once, is there a way for me to run the t test once per group and collect the intended table? I think there is some combination of broom and purrr that does this, but I am unfamiliar with the syntax.
# code idea (I know this won't work!)
tt_data <- iris %>%
group_by(Species) %>%
summarise(tt = t.test(Sepal.Length,Petal.Length,alternative="two.sided",paired=T)) %>%
select(Species, tt.p, tt.estimate)
tt_data
# Species tt.p tt.estimate
# setosa 2.542887e-51 3.544
# versicolor 9.667914e-36 1.676
# virginica 7.985259e-28 1.036
You can use broom::tidy() to transform the result of the t.test to a tidy 'tibble':
library(dplyr)
library(broom)
# Run the paired t-test once per species; tidy() turns the test result into
# a data frame whose columns include estimate and p.value.
iris %>%
  group_by(Species) %>%
  group_modify(~ {
    t.test(
      .x$Sepal.Length, .x$Petal.Length,
      alternative = "two.sided", paired = TRUE
    ) %>%
      tidy()
  }) %>%
  # The data are still grouped, so Species is kept alongside the selection.
  select(estimate, p.value)
#> Adding missing grouping variables: `Species`
#> # A tibble: 3 x 3
#> # Groups: Species [3]
#> Species estimate p.value
#> <fct> <dbl> <dbl>
#> 1 setosa 3.54 2.54e-51
#> 2 versicolor 1.68 9.67e-36
#> 3 virginica 1.04 7.99e-28
Created on 2020-09-02 by the reprex package (v0.3.0)
You can use map to select the desired values from the list generated by t.test and by tidying it up to a data frame via broom::tidy, i.e.
library(dplyr)
library(tidyr)
# Store one tidied t-test result per species in a list-column, keep only the
# columns of interest, then expand the list-column into regular columns.
iris %>%
  group_by(Species) %>%
  summarise(
    tt = list(broom::tidy(t.test(
      Sepal.Length, Petal.Length,
      alternative = "two.sided", paired = TRUE
    )))
  ) %>%
  mutate(tt = purrr::map(tt, ~ select(.x, p.value, estimate))) %>%
  # Name the column to expand: unnest() without `cols` is deprecated.
  unnest(cols = tt)
# A tibble: 3 x 3
# Species p.value estimate
# <fct> <dbl> <dbl>
#1 setosa 2.54e-51 3.54
#2 versicolor 9.67e-36 1.68
#3 virginica 7.99e-28 1.04

How to use "summarise" from dplyr with dynamic column names?

I am summarizing group means from a table using the summarize function from the dplyr package in R. I would like to do this dynamically, using a column name string stored in another variable.
The following is the "normal" way and it works, of course:
# Group by species, then average Sepal.Length within each group.
myTibble <- iris %>% group_by(Species)
myTibble %>% summarise(avg = mean(Sepal.Length))
# A tibble: 3 x 2
Species avg
<fct> <dbl>
1 setosa 5.01
2 versicolor 5.94
3 virginica 6.59
However, I would like to do something like this instead:
myTibble <- group_by( iris, Species)
colOfInterest <- "Sepal.Length"
# colOfInterest is a character string here, so mean() receives the string
# itself rather than the Sepal.Length column.
summarise( myTibble, avg = mean( colOfInterest))
I've read the Programming with dplyr page, and I've tried a bunch of combinations of quo, enquo, !!, .dots=(...), etc., but I haven't figured out the right way to do it yet.
I'm also aware of this answer, but, 1) when I use the standard-evaluation function summarise_, R tells me that it's deprecated, and 2) that answer doesn't seem elegant at all. So, is there a good, easy way to do this?
Thank you!
1) Use !!sym(...) like this:
colOfInterest <- "Sepal.Length"
# sym() turns the string into a symbol and !! unquotes it inside summarize().
iris %>%
  group_by(Species) %>%
  summarize(avg = mean(!!sym(colOfInterest))) %>%
  ungroup()
giving:
# A tibble: 3 x 2
Species avg
<fct> <dbl>
1 setosa 5.01
2 versicolor 5.94
3 virginica 6.59
2) A second approach is:
colOfInterest <- "Sepal.Length"
# The .data pronoun looks the column up by its name held in a string.
iris %>%
  group_by(Species) %>%
  summarize(avg = mean(.data[[colOfInterest]])) %>%
  ungroup()
Of course this is straightforward in base R:
aggregate(list(avg = iris[[colOfInterest]]), iris["Species"], mean)
Another solution:
# summarise_at() accepts the column name as a string inside vars().
iris %>%
  group_by(Species) %>%
  summarise_at(.vars = vars("Sepal.Length"), .funs = mean) %>%
  ungroup()
# A tibble: 3 x 2
Species Sepal.Length
<fct> <dbl>
1 setosa 5.01
2 versicolor 5.94
3 virginica 6.59

R dplyr: Write list output to dataframe

Suppose I have the following function
# Summarise a vector: returns a list with elements `mean` and `sd`.
SlowFunction <- function(vector) {
  list(mean = mean(vector), sd = sd(vector))
}
And I would like to use dplyr::summarise to write the results to a dataframe:
# SlowFunction() is evaluated twice per group here, once per output column.
iris %>%
  dplyr::group_by(Species) %>%
  dplyr::summarise(
    mean = SlowFunction(Sepal.Length)$mean,
    sd = SlowFunction(Sepal.Length)$sd
  )
Does anyone have a suggestion how I can do this by calling "SlowFunction" once instead of twice? (In my code "SlowFunction" is a slow function that I have to call many times.) Without splitting "SlowFunction" in two parts of course. So actually I would like to somehow fill multiple columns of a dataframe in one statement.
Without changing your current SlowFunction one way is to use do
library(dplyr)
# Within do(), `.` stands for the current group's rows; SlowFunction() is
# called once per group and its list result becomes a one-row data frame.
iris %>%
  group_by(Species) %>%
  do(data.frame(SlowFunction(.$Sepal.Length)))
# Species mean sd
# <fct> <dbl> <dbl>
#1 setosa 5.01 0.352
#2 versicolor 5.94 0.516
#3 virginica 6.59 0.636
Or with group_split + purrr::map_dfr
# group_split() yields one data frame per species; map_dfr() calls
# SlowFunction() once per piece and row-binds the results. purrr is
# namespaced because this snippet only attaches dplyr.
# NOTE(review): the labels from unique() line up with the split pieces only
# when first-appearance order equals group order - confirm for other data.
bind_cols(
  Species = unique(iris$Species),
  iris %>%
    group_split(Species) %>%
    purrr::map_dfr(~ SlowFunction(.x$Sepal.Length))
)
An option is to store the output of SlowFunction in a list column of data.frames and then to use unnest
# One SlowFunction() call per group, stored as a one-row data frame in the
# list-column `res`, which is then expanded into regular columns.
iris %>%
  group_by(Species) %>%
  summarise(res = list(as.data.frame(SlowFunction(Sepal.Length)))) %>%
  # Name the column to expand: unnest() without `cols` is deprecated.
  tidyr::unnest(cols = res)
## A tibble: 3 x 3
# Species mean sd
# <fct> <dbl> <dbl>
#1 setosa 5.01 0.352
#2 versicolor 5.94 0.516
#3 virginica 6.59 0.636
We can use group_map if you are using dplyr 0.8.0 or later. The output from SlowFunction needs to be converted to a data frame.
library(dplyr)
# group_modify() returns the grouped data frame shown below; since
# dplyr 0.8.1, group_map() returns a plain list instead.
iris %>%
  group_by(Species) %>%
  group_modify(~ as.data.frame(SlowFunction(.x$Sepal.Length)))
# # A tibble: 3 x 3
# # Groups: Species [3]
# Species mean sd
# <fct> <dbl> <dbl>
# 1 setosa 5.01 0.352
# 2 versicolor 5.94 0.516
# 3 virginica 6.59 0.636
We can change the SlowFunction to return a tibble and
# Summarise a vector as a one-row tibble with columns `mean` and `sd`.
SlowFunction <- function(vector) {
  tibble(mean = mean(vector), sd = sd(vector))
}
and then unnest the summarise output in a list
# One SlowFunction() call per group; the resulting tibbles are kept in the
# list-column `out` and then expanded into regular columns.
iris %>%
  group_by(Species) %>%
  summarise(out = list(SlowFunction(Sepal.Length))) %>%
  # Name the column to expand: unnest() without `cols` is deprecated.
  tidyr::unnest(cols = out)
# A tibble: 3 x 3
# Species mean sd
# <fct> <dbl> <dbl>
#1 setosa 5.01 0.352
#2 versicolor 5.94 0.516
#3 virginica 6.59 0.636

Using pre-assigned column name within a pipe

I would like to pre-assign my column name and use that within a dplyr pipe
Here's an example. I want to do this:
# Mean Petal.Length per species, using only rows with Sepal.Width > 3.
iris %>%
  group_by(Species) %>%
  summarise(Var = mean(Petal.Length[Sepal.Width > 3]))
But with the column name assigned outside of the pipe, like this
col_name <- "Petal.Length"
iris %>%
group_by(Species) %>%
# NOTE(review): `[` binds tighter than `!!`, so the unquote applies to
# col_name[Sepal.Width > 3] as a whole rather than to col_name alone.
summarise(Var = mean(!!col_name[Sepal.Width > 3]))
We can convert to symbol (sym) and then do the evaluation (!!)
# Convert the string to a symbol and unquote it; the parentheses make the
# subset apply to the column rather than to the string.
iris %>%
  group_by(Species) %>%
  summarise(Var = mean((!!rlang::sym(col_name))[Sepal.Width > 3]))
# A tibble: 3 x 2
# Species Var
# <fct> <dbl>
#1 setosa 1.48
#2 versicolor 4.65
#3 virginica 5.72
If we need to use only dplyr, then we can pass the variable object to summarise_at
# funs() is deprecated (dplyr 0.8.0); a formula lambda does the same job.
# summarise_at() also accepts the column name as a character vector.
iris %>%
  group_by(Species) %>%
  summarise_at(col_name, ~ mean(.[Sepal.Width > 3]))
# A tibble: 3 x 2
# Species Petal.Length
# <fct> <dbl>
#1 setosa 1.48
#2 versicolor 4.65
#3 virginica 5.72

Resources