Say I have a data frame like this:
# Example data: two grouping columns and one numeric column.
group1 <- c('a','a','a','a','a','a','b','b','b','b','b','b','b','b')
group2 <- c('x','y','x','y','x','y','x','y','x','y','x','y','x','y')
value <- round(runif(14, min = 0, max = 1), digits = 2)
# Build the data frame directly. Going through cbind() first produces a
# character matrix, so `value` stops being numeric; where strings are turned
# into factors, as.numeric() would then return level codes, not the values.
df1 <- data.frame(group1, group2, value)
It is easy to get a new data frame with only the maximum values of each group, by using the dplyr package and summarise function:
df2 <- summarise(group_by(df1,group1),max_v = max(value))
But what I want is a new data frame with the 3 maximum values of each group, doing something like that:
df2 <- summarise(group_by(df1,group1),max_v = max(value),max2_v = secondmax(value),max3_v = thirdmax(value))
Is there a way to do that without using the sort function ?
We can use an arrange/slice/spread approach to get this:
library(dplyr)
library(tidyr)
# Keep the three largest `value`s of each `group1` and spread them into the
# columns max_1, max_2, max_3 (one row per group).
df1 %>%
  select(-group2) %>%        # group2 is not part of the result
  group_by(group1) %>%
  arrange(desc(value)) %>%   # largest values first
  slice(1:3) %>%             # first three rows of every group
  mutate(Max = paste0("max_", seq_len(n()))) %>%
  spread(Max, value)
# A tibble: 2 x 4
# Groups: group1 [2]
# group1 max_1 max_2 max_3
#* <fctr> <dbl> <dbl> <dbl>
#1 a 0.84 0.69 0.41
#2 b 0.89 0.72 0.54
data
df1 <- data.frame(group1,group2,value)
Related
I used group_map for the first time and think I do it correctly. This is my code:
library(REAT)
df <- data.frame(value = c(1,1,1, 1,0.5,0.1, 0,0,0,1), group = c(1,1,1, 2,2,2, 3,3,3,3))
haves <- df %>%
group_by(group) %>%
group_map(~gini(.x$value, coefnorm = TRUE))
The thing is that haves is a list rather than a data frame. What would I have to do to obtain this df
wants <- data.frame(group = c(1,2,3), gini = c(0,0.5625,1))
group gini
1 0.0000
2 0.5625
3 1.0000
Thanks!
You can use dplyr::summarize:
df %>%
group_by(group) %>%
summarize(gini = gini(value, coefnorm = TRUE))
#> # A tibble: 3 x 2
#> group gini
#> <dbl> <dbl>
#> 1 1 0
#> 2 2 0.562
#> 3 3 1
According to the documentation, group_map always produces a list. group_modify is an alternative that produces a tibble if the function does, but gini just outputs a vector. So, you could do something like this...
df %>%
group_by(group) %>%
group_modify(~tibble(gini = gini(.x$value, coefnorm = TRUE)))
# A tibble: 3 x 2
# Groups: group [3]
group gini
<dbl> <dbl>
1 1 0
2 2 0.562
3 3 1
Using data.table
library(data.table)
setDT(df)[, .(gini = gini(value, coefnorm = TRUE)), group]
For grouped datasets, we can use the .data pronoun if we don't want to refer to columns by bare, unquoted names:
library(dplyr)
df %>%
group_by(group) %>%
summarize(gini = gini(.data$value, coefnorm = TRUE))
I am struggling with one task: I have a data frame where one column is always numeric and the others are always factors. I don't know the index of the numeric column.
My task is to group the data frame by all factors and then find the mean and sd within each group.
I have already done some part of work:
library(dplyr)
library(stats)
df <- data.frame(
col1 = sample(LETTERS[1:3], 100, replace=TRUE),
col2 = sample(LETTERS[1:3], 100, replace=TRUE),
col3 = rnorm(100))
df
# Question's attempt: group `df` by every non-numeric column and report the
# mean and SD of the numeric column.
find_mean_sd <- function(df){
# Position(s) of the numeric column(s).
numeric <- which(sapply(df,is.numeric)==TRUE)
# Names of the remaining columns, used as grouping variables.
columns <- names(df)[-numeric]
dots <- lapply(columns, as.symbol)
# NOTE: `df[,numeric]` below indexes the full data frame passed to the
# function, not the rows of the current group, so the same mean and SD are
# computed for every group.
df %>%
group_by_(.dots=dots) %>%
summarise(mean = mean(df[,numeric]), SD= sd(df[,numeric]))
}
find_mean_sd(df)
I am confused by the mean and sd: why are they the same for all groups? I wanted to get 9 different means.
In case you want to fix your code, you can try this:
library(dplyr)
# Group `df` by all of its non-numeric columns and summarise the remaining
# numeric column with its mean and standard deviation.
#
# df: a data frame whose non-numeric columns are the grouping variables.
# Returns one row per group; with a single numeric column the summary
# columns are named `mean` and `SD`.
find_mean_sd <- function(df) {
  # Use a logical mask rather than which(): with no numeric column,
  # `names(df)[-integer(0)]` would drop every column instead of keeping all.
  is_num <- vapply(df, is.numeric, logical(1))
  columns <- names(df)[!is_num]
  df %>%
    group_by_at(columns) %>%
    # Add or swap the functions in this named list as needed; the names
    # become the output column names.
    summarise_all(list(mean = mean, SD = sd))
}
find_mean_sd(df)
# A tibble: 9 x 4
# Groups: col1 [3]
col1 col2 mean SD
<fct> <fct> <dbl> <dbl>
1 A A 0.202 1.19
2 A B -0.141 0.950
3 A C 0.585 0.596
4 B A -0.0812 1.20
5 B B -0.380 1.18
6 B C 0.300 0.846
7 C A -0.152 0.705
8 C B 0.136 1.39
9 C C 0.263 0.762
I think the problem was that you referred to df itself inside summarise() within the dplyr chain, which is not what you need here: it uses the whole data frame rather than each group. That said, A. Suliman's solution is more elegant.
We can use dplyr::*_if to select the required columns
library(dplyr)
df %>%
group_by_if(is.factor) %>%
summarise_if(is.numeric, list(mean=~mean(., na.rm = TRUE), SD=~sd(.,na.rm = TRUE)))
I have the following example, where I pass a simple dataframe to a function that summarizes a column. The name of the summarizing column, s, I would like to have as a parameter to the function:
df <- data.frame(id = c(1,1,1,1,1,2,2,2,2,2),
a=c(1:10),
b=c(10:19))
# Question's attempt: sum column `a` per `id`; `s` is meant to supply the name
# of the resulting column.
# NOTE(review): the name `sum` masks base::sum where this is defined — confirm
# that the inner `sum(a)` call still resolves as intended.
# NOTE: `s = ...` inside summarize() is taken literally as the output column
# name "s"; the value of the argument `s` is not used.
sum <- function(df, s){
df <- df %>%
group_by(id) %>%
summarize(s = sum(a))
return(df)
}
sum(df = df, s = "summarizing.column.label")
However, regardless of the value I set, the summarizing column always gets the same name, s. Is there a way to alter it?
EDIT: The output I would like is:
sum(df = df, s = "summarizing.column.label")
id summarizing.column.label
<dbl> <int>
1 1.00 15
2 2.00 40
sum(df = df, s = "a")
id a
<dbl> <int>
1 1.00 15
2 2.00 40
If we are passing a quoted argument (a string), then one option is to use rename_at after the summarise:
# Sum column `a` within each `id`, then rename the result column to the
# string supplied in `s`.
sumf <- function(df, s) {
  totals <- summarize(group_by(df, id), a = sum(a))
  rename_at(totals, "a", ~ s)
}
sumf(df, s ="summarizing.column.label" )
# A tibble: 2 x 2
# id summarizing.column.label
# <dbl> <int>
#1 1.00 15
#2 2.00 40
sumf(df, s ="a" )
# A tibble: 2 x 2
# id a
# <dbl> <int>
#1 1.00 15
#2 2.00 40
Or another option is to make use of := with !!
# Sum column `a` within each `id`, then rename it to the string in `s` by
# unquoting `s` on the left-hand side of `:=`.
sumf <- function(df, s) {
  per_id <- summarize(group_by(df, id), a = sum(a))
  rename(per_id, !!s := a)
}
sumf(df, s ="summarizing.column.label" )
# A tibble: 2 x 2
# id summarizing.column.label
# <dbl> <int>
#1 1.00 15
#2 2.00 40
Or within summarise
# Sum column `a` within each `id`, naming the summary column directly from
# the string in `s`.
sumf <- function(df, s) {
  by_id <- group_by(df, id)
  summarise(by_id, !!s := sum(a))
}
sumf(df, s ="summarizing.column.label" )
Try this:
# Sum column `a` within each `id`, naming the result column after the string
# passed in `s`.
#
# df: data frame with columns `id` and `a`.
# s:  string; name to give the summary column.
# Returns the summarised data frame (one row per `id`).
sum <- function(df, s) {
  # This function masks base::sum, so the base version is called explicitly;
  # a bare `sum(a)` can resolve to this function itself and fail.
  df %>%
    group_by(id) %>%
    summarize(!!s := base::sum(a))
}
I want the count and proportion (of all of elements) of each group in a data frame (after filtering). This code produces the desired output:
library(dplyr)
df <- data_frame(id = sample(letters[1:3], 100, replace = TRUE),
value = rnorm(100))
summary <- filter(df, value > 0) %>%
group_by(id) %>%
summarize(count = n()) %>%
ungroup() %>%
mutate(proportion = count / sum(count))
> summary
# A tibble: 3 x 3
id count proportion
<chr> <int> <dbl>
1 a 17 0.3695652
2 b 13 0.2826087
3 c 16 0.3478261
Is there an elegant solution that avoids the ungroup() and the separate mutate() steps? Something like:
summary <- filter(df, value > 0) %>%
group_by(id) %>%
summarize(count = n(),
proportion = n() / [?TOTAL_ROWS()?])
I couldn't find such a function in the documentation, but I must be missing something obvious. Thanks!
You can use nrow on . which refers to the entire data frame piped in:
df %>%
filter(value > 0) %>%
group_by(id) %>%
summarise(count = n(), proportion = count / nrow(.))
# A tibble: 3 x 3
# id count proportion
# <chr> <int> <dbl>
#1 a 14 0.2592593
#2 b 22 0.4074074
#3 c 18 0.3333333
Using tidyr/dplyr, I have some factor columns which I'd like to Z-score, and then mutate an average Z-score, whilst retaining the original data for reference.
I'd like to avoid using a for loop in tidyr/dplyr, thus I'm gathering my data and performing my calculation (Z-score) on a single column. However, I'm struggling with restoring the wide format.
Here is a MWE:
library(dplyr)
library(tidyr)
# Original Data
dfData <- data.frame(
Name = c("Steve","Jwan","Ashley"),
A = c(10,20,12),
B = c(0.2,0.3,0.5)
) %>% tbl_df()
# Gather to Z-score
dfLong <- dfData %>% gather("Factor","Value",A:B) %>%
mutate(FactorZ = paste0("Z_",Factor)) %>%
group_by(Factor) %>%
mutate(ValueZ = (Value - mean(Value,na.rm = TRUE))/sd(Value,na.rm = TRUE))
# Now go wide to do some mutations (e.g. Z_Avg = (Z_A + Z_B)/2)
# This does not work
dfWide <- dfLong %>%
spread(Factor,Value) %>%
spread(FactorZ,ValueZ)%>%
mutate(Z_Avg = (Z_A+Z_B)/2)
# This is the desired result
dfDesired <- dfData %>% mutate(Z_A = (A - mean(A,na.rm = TRUE))/sd(A,na.rm = TRUE)) %>% mutate(Z_B = (B - mean(B,na.rm = TRUE))/sd(B,na.rm = TRUE)) %>%
mutate(Z_Avg = (Z_A+Z_B)/2)
Thanks for any help/input!
Another approach using dplyr (version 0.5.0)
library(dplyr)
dfData %>%
mutate_each(funs(Z = scale(.)), -Name) %>%
mutate(Z_Avg = (A_Z+B_Z)/2)
# Mean of `x`, ignoring missing values.
means <- function(x) {
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  mean(x, na.rm = TRUE)
}
dfWide %>% group_by(Name) %>% summarise_each(funs(means)) %>% mutate(Z_Avg = (Z_A + Z_B)/2)
# A tibble: 3 x 6
Name A B Z_A Z_B Z_Avg
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Ashley 12 0.5 -0.3779645 1.0910895 0.3565625
2 Jwan 20 0.3 1.1338934 -0.2182179 0.4578378
3 Steve 10 0.2 -0.7559289 -0.8728716 -0.8144003
Here is one approach with long and wide format. For z-transformation, you can use the base function scale. Furthermore, this approach includes a join to combine the original data frame and the one including the new values.
dfLong <- dfData %>%
gather(Factor, Value, A:B) %>%
group_by(Factor) %>%
mutate(ValueZ = scale(Value))
# Name Factor Value ValueZ
# <fctr> <chr> <dbl> <dbl>
# 1 Steve A 10.0 -0.7559289
# 2 Jwan A 20.0 1.1338934
# 3 Ashley A 12.0 -0.3779645
# 4 Steve B 0.2 -0.8728716
# 5 Jwan B 0.3 -0.2182179
# 6 Ashley B 0.5 1.0910895
dfWide <- dfData %>% inner_join(dfLong %>%
ungroup %>%
select(-Value) %>%
mutate(Factor = paste0("Z_", Factor)) %>%
spread(Factor, ValueZ) %>%
mutate(Z_Avg = (Z_A + Z_B) / 2))
# Name A B Z_A Z_B Z_Avg
# <fctr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Steve 10 0.2 -0.7559289 -0.8728716 -0.8144003
# 2 Jwan 20 0.3 1.1338934 -0.2182179 0.4578378
# 3 Ashley 12 0.5 -0.3779645 1.0910895 0.3565625
I would just do it all in wide format. No need to keep switching between the long and wide formats.
# Z-score columns A and B in wide format and average the two scores.
# Columns are referenced directly rather than through `dfData$...`, so the
# calculation uses the data flowing through the pipe. `na.rm = TRUE` and the
# `Z_Avg` name follow the desired result stated in the question.
dfData %>%
  mutate(
    Z_A = (A - mean(A, na.rm = TRUE)) / sd(A, na.rm = TRUE),
    Z_B = (B - mean(B, na.rm = TRUE)) / sd(B, na.rm = TRUE)
  ) %>%
  mutate(Z_Avg = (Z_A + Z_B) / 2)