dplyr summarise_all with quantile and other functions - r

I have a dataframe PatientA
Height Weight Age BMI
<dbl> <dbl> <dbl> <dbl>
1 161 72.2 27 27.9
2 164 61.0 21 22.8
3 171 72.0 30 24.6
4 169. 63.9 25 22.9
5 174. 64.4 27 21.1
6 160 50.9 22 19.9
7 172 77.5 22 26.3
8 165 54.5 22 20
9 173 82.4 29 27.5
10 169 76.6 22 26.9
and I would like to get some statistics for each column. I have the next working code which deals only with quantiles
genStat <- PatientsA %>%
summarise_all(funs(list(quantile(., probs = c(0.25, 0.5, 0.75))))) %>%
unnest %>%
transpose %>%
setNames(., c('25%', '50%', '75%')) %>%
map_df(unlist) %>%
bind_cols(data.frame(vars = names(PatientsA)), .)
and I need to add mean and sd to summarise_all like this
genStat <- PatientsA %>%
summarise_all(funs(mean,sd,list(quantile(., probs = c(0.25, 0.5, 0.75))))) %>%
unnest %>%
transpose %>%
setNames(., c('mean','sd','25%', '50%', '75%')) %>%
map_df(unlist) %>%
bind_cols(data.frame(vars = names(PatientsA)), .)
This straightforward approach fails, returning the following error:
Error in names(object) <- nm : 'names' attribute [5] must be the
same length as the vector [3]
I'm a newbie in R, so what is the right syntax for completing this task?

This is what I would suggest. There is a little repetition in the code (calling quantile three times) but overall I think it is easier to understand and debug.
library(tidyverse)
# Reshape to long form so a single summarize() call covers every column,
# then compute each statistic once per variable.
PatientsA %>%
gather("variable", "value") %>%   # wide -> long: one row per (variable, value) pair
group_by(variable) %>%
summarize(mean_val = mean(value),
sd_val = sd(value),
q25 = quantile(value, probs = .25),
q50 = quantile(value, probs = .5),
q75 = quantile(value, probs = .75))
## A tibble: 4 x 6
# variable mean_val sd_val q25 q50 q75
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Age 24.7 3.33 22 23.5 27
#2 BMI 24.0 3.08 21.5 23.8 26.7
#3 Height 168. 5.01 164. 169 172.
#4 Weight 67.5 10.3 61.7 68.2 75.5

We could also place the quantile output in a list and then unnest
library(tidyverse)
# NOTE(review): funs() (dplyr) and as.tibble() (tibble) are deprecated in
# current releases; the pivot_longer()/across() variant below is the modern
# equivalent of this pipeline.
PatientsA %>%
gather %>%             # wide -> long with the default `key`/`value` columns
group_by(key) %>%
summarise_at(vars('value'),
funs(mean,
sd,
# wrap the three quantiles in a one-row tibble so unnest() can widen them
quantile = list(as.tibble(as.list(quantile(.,
probs = c(0.25, 0.5, 0.75))))))) %>%
unnest
# A tibble: 4 x 6
# key mean sd `25%` `50%` `75%`
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Age 24.7 3.33 22 23.5 27
#2 BMI 24.0 3.08 21.5 23.8 26.7
#3 Height 168. 5.01 164. 169 172.
#4 Weight 67.5 10.3 61.7 68.2 75.5
Or using pivot_longer
PatientsA %>%
pivot_longer(cols = everything()) %>%
group_by(name) %>%
summarise(across(value, list(mean= ~ mean(., na.rm = TRUE),
sd = ~ sd(., na.rm = TRUE),
quantile = ~ list(as_tibble(as.list(quantile(.,
probs = c(0.25, 0.5, 0.75)))))))) %>%
unnest(c(value_quantile))
# A tibble: 4 x 6
name value_mean value_sd `25%` `50%` `75%`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Age 24.7 3.33 22 23.5 27
2 BMI 24.0 3.08 21.5 23.8 26.7
3 Height 168. 5.01 164. 169 172.
4 Weight 67.5 10.3 61.7 68.2 75.5
###data
PatientsA <- structure(list(Height = c(161, 164, 171, 169, 174, 160, 172,
165, 173, 169), Weight = c(72.2, 61, 72, 63.9, 64.4, 50.9, 77.5,
54.5, 82.4, 76.6), Age = c(27L, 21L, 30L, 25L, 27L, 22L, 22L,
22L, 29L, 22L), BMI = c(27.9, 22.8, 24.6, 22.9, 21.1, 19.9, 26.3,
20, 27.5, 26.9)), class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6", "7", "8", "9", "10"))

Related

compute variable over the value of the difference between another variable this year and the previous one R

In the data below I want to compute the following ratio tr(year)/(op(year) - op(year-1). I would appreciate an answer with dplyr.
year op tr cp
<chr> <dbl> <dbl> <dbl>
1 1984 10 39.1 38.3
2 1985 55 132. 77.1
3 1986 79 69.3 78.7
4 1987 78 47.7 74.1
5 1988 109 77.0 86.4
this is the expected output
year2 ratio
1 1985 2.933333
2 1986 2.887500
3 1987 -47.700000
4 1988 2.483871
I do not manage to get to any result...
Use lag:
library(dplyr)
# ratio = tr(year) / (op(year) - op(year - 1)); lag(op) shifts op down by one
# row, so the first year (1984) produces NA and is dropped afterwards.
df %>%
mutate(year = year,
ratio = tr / (op - lag(op)),
.keep = "none") %>%   # .keep = "none": retain only the columns created here
tidyr::drop_na()      # remove the NA row produced for the first year
# year ratio
#2 1985 2.933333
#3 1986 2.887500
#4 1987 -47.700000
#5 1988 2.483871
We may use
library(dplyr)
# diff(op) gives op[t] - op[t-1] (length n - 1); dropping the first element of
# year and tr with [-1] aligns them with those differences. reframe()
# (dplyr >= 1.1) permits summaries whose length differs from the input.
df1 %>%
reframe(year = year[-1], ratio = tr[-1]/diff(op))
-output
year ratio
1 1985 2.933333
2 1986 2.887500
3 1987 -47.700000
4 1988 2.483871
data
df1 <- structure(list(year = 1984:1988, op = c(10L, 55L, 79L, 78L, 109L
), tr = c(39.1, 132, 69.3, 47.7, 77), cp = c(38.3, 77.1, 78.7,
74.1, 86.4)), class = "data.frame", row.names = c("1", "2", "3",
"4", "5"))

Multidimensional Crosstable (median values)

I have the following sample data:
samplesize=100
df <- data.frame(sex = sample(c("M", "F"), size = samplesize, replace = TRUE),
agegrp = sample(c("old", "middle", "young"), size = samplesize, replace = TRUE),
duration1 = runif(samplesize, min = 1, max = 100),
duration2 = runif(samplesize, min = 1, max = 100),
country = sample(c("USA", "CAN"), size = samplesize, replace = TRUE))
df
My goal is to plot a table like this that displays the median values [median(na.rm = TRUE) as there might be missing values]
USA CAN
total old middle young M F total old middle young M F
duration1 10.2 12.2 13.1 10.2 13.0 13.9 ... ... ... ... ... ...
duration2 10.4 13.2 13.2 10.0 13.1 14.0 ... ... ... ... ... ...
The way I would usually calculate such a table is to calculate the median values columnwise:
# Column-wise medians per country/age-group combination. Each duration column
# needs its own summarise() argument, which is why this approach becomes
# cumbersome as the number of combinations grows.
df %>%
group_by(country, agegrp) %>%
summarise(dur1 = median(duration1, na.rm = TRUE),
dur2 = median(duration2, na.rm = TRUE))
And finally I put all the columns together. Unfortunately, as the number of combinations gets bigger, this methods becomes very cumbersome. So my question is:
Is there any function like table() that lets me calculate means or medians (instead of frequencies) using specific combinations of variables?
It would also be fine if it was just a two-dimensional table with multi-dimensional variable names like:
USA_total USA_old USA_middle USA_young USA_m USA_f CAN_total ...
duration1
duration2
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = -c(sex, agegrp, country), names_to = "parameters") %>%  # long form: one row per measurement
group_by(agegrp, country, parameters) %>%
# NOTE(review): the question asks for medians — swap mean() for
# median(value, na.rm = TRUE) to match the requested output.
summarise(mean = mean(value, na.rm=TRUE)) %>%
pivot_wider(names_from = c(country, agegrp), values_from = mean)  # one column per country_agegrp pair
Returns:
parameters CAN_middle USA_middle CAN_old USA_old CAN_young USA_young
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 duration1 48.6 62.6 31.5 40.0 43.0 50.5
2 duration2 60.9 54.0 53.1 58.9 45.1 55.6
Edit
Including M and F:
library(dplyr)
library(tidyr)
df %>%
# stack sex and agegrp into a single `groupings` column so one pass produces
# both the sex-wise and agegrp-wise summaries side by side
pivot_longer(cols = c(sex, agegrp), names_to = "groupings_names", values_to="groupings") %>%
select(-groupings_names) %>%
pivot_longer(cols = -c(groupings, country), names_to = "parameters") %>%
group_by(groupings, country, parameters) %>%
# NOTE(review): use median(value, na.rm = TRUE) here if medians are required,
# as in the original question.
summarise(mean = mean(value, na.rm=TRUE)) %>%
pivot_wider(names_from = c(country, groupings), values_from = mean)
parameters CAN_F USA_F CAN_M USA_M CAN_middle USA_middle CAN_old USA_old CAN_young USA_young
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 duration1 63.3 59.0 50.9 47.7 57.9 46.1 56.8 60.6 59.5 49.1
2 duration2 60.6 59.0 54.9 48.3 65.0 45.6 48.5 49.5 55.8 62.4

Using dplyr summarise() for specific columns within purrr map() with grouped data

I have a problem I'm trying to solve, and I can't seem to find a succinct solution. There are a few similar questions on SO, but nothing that quite fits.
Take some sample data:
library(dplyr)
dat <- tibble(
group1 = factor(sample(c("one", "two"), 10, replace = T)),
group2 = factor(sample(c("alpha", "beta"), 10, replace = T)),
var1 = rnorm(10, 20, 2),
var2 = rnorm(10, 20, 2),
var3 = rnorm(10, 20, 2),
other1 = sample(c("a", "b", "c"), 10, replace = T),
other2 = sample(c("a", "b", "c"), 10, replace = T),
)
I would like to summarise just the numeric variables (i.e. ignoring other1 and other2), but have the output grouped by group1 and group2.
I have tried something like this, but it returns an error as it attempts to apply my summarise() functions to the grouping variables too.
dat %>%
group_by(group1, group2) %>%
select(where(is.numeric)) %>%
map(~ .x %>%
filter(!is.na(.x)) %>%
summarise(mean = mean(.x),
sd = sd(.x),
median = median(.x),
q1 = quantile(.x, p = .25),
q3 = quantile(.x, p = .75))
)
My expected output would be something like
group1 group2 mean sd median q1 q3
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
1 one alpha ? ? ? ? ?
2 one beta ? ? ? ? ?
3 two alpha ? ? ? ? ?
4 two beta ? ? ? ? ?
Any solutions would be greatly appreciated.
Thanks,
Sam
Try:
# Summarise every numeric column per group. Bare predicates such as
# `is.numeric` must be wrapped in where() inside across() — passing the
# predicate directly was deprecated in dplyr 1.0 and errors in later
# releases. A named list of functions yields <column>_<name> output
# columns (var1_sd, var1_mean, ...). Grouping columns are excluded from
# across() automatically.
dat %>%
group_by(group1, group2) %>%
summarize(across(where(is.numeric),
c(sd = sd,
mean = mean,
median = median,
q1 = function(x) quantile(x, .25),
q3 = function(x) quantile(x, .75))))
group1 group2 var1_sd var1_mean var1_median var1_q1 var1_q3 var2_sd var2_mean var2_median var2_q1 var2_q3 var3_sd
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 one alpha 4.06 20.6 19.3 18.3 22.2 1.12 17.9 17.3 17.2 18.2 1.09
2 one beta 0.726 18.7 18.7 18.4 18.9 0.348 18.8 18.8 18.7 18.9 0.604
3 two alpha 1.31 19.9 20.0 19.3 20.6 1.10 17.8 18.3 17.4 18.5 0.624
4 two beta 0.777 21.2 21.2 21.0 21.5 1.13 19.6 19.6 19.2 20.0 0.0161
You can also pass the columns to the functions in summarise:
# Pool the three columns' values with c() before summarising.
# (The original used `var1:var3`, which calls seq() on the columns' first
# values — producing a numeric sequence, not a column selection — which is
# why it printed NA standard deviations for some groups.)
dat %>%
group_by(group1, group2) %>%
summarise(mean = mean(c(var1, var2, var3)),
sd = sd(c(var1, var2, var3)),
median = median(c(var1, var2, var3)),
q1 = quantile(c(var1, var2, var3), probs = .25),
q3 = quantile(c(var1, var2, var3), probs = .75))
dat
# A tibble: 4 x 7
# Groups: group1 [2]
# group1 group2 mean sd median q1 q3
# <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 one alpha 19.1 0.707 19.1 18.8 19.3
# 2 one beta 17.5 1.29 17.5 16.8 18.3
# 3 two alpha 17.1 NA 17.1 17.1 17.1
# 4 two beta 19.9 NA 19.9 19.9 19.9

How to remove rows with 0 in numeric columns in R [duplicate]

This question already has answers here:
Extracting columns having greater than certain values in R dataframe
(5 answers)
Select columns that don't contain any NA value in R
(3 answers)
Closed 2 years ago.
i have the following Dataset:
structure(list(Species = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label =
c("Bream", "Parkki", "Perch", "Pike", "Roach", "Smelt", "Whitefish"),
class = "factor"),
WeightGRAM = c(242, 290, 340, 363, 430, 450), VertLengthCM = c(23.2,
24, 23.9, 26.3, 26.5, 26.8), DiagLengthCM = c(25.4, 26.3,
26.5, 29, 29, 29.7), CrossLengthCM = c(30, 31.2, 31.1, 33.5,
34, 34.7), HeightCM = c(11.52, 12.48, 12.3778, 12.73, 12.444,
13.6024), WidthCM = c(4.02, 4.3056, 4.6961, 4.4555, 5.134,
4.9274)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
I am trying to check for "0" or negative values in the numeric columns and remove them.
I have the following code:
fish_data <- fish_data [which(rowSums(fish_data) > 0), ]
But i will get a error message:
Error in rowSums(fish_data) : 'x' must be numeric
I roughly guess because my "species" columns are factor, this message came up.
Can I know how I can skip the first column and ask R to check only the numeric columns for "0" or negative values?
Here is a way that keeps only the columns with no values less than or equal to zero.
# Keep only the columns that are non-numeric or strictly positive throughout.
# vapply() pins the result to exactly one logical per column; sapply() can
# silently change its return shape on edge cases (e.g. an empty data frame).
keep <- vapply(fish_data, function(x) {
if (is.numeric(x)) all(x > 0) else TRUE
}, logical(1))
fish_data[keep]   # single-bracket indexing keeps the data.frame class
## A tibble: 6 x 7
# Species WeightGRAM VertLengthCM DiagLengthCM CrossLengthCM HeightCM WidthCM
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Bream 242 23.2 25.4 30 11.5 4.02
#2 Bream 290 24 26.3 31.2 12.5 4.31
#3 Bream 340 23.9 26.5 31.1 12.4 4.70
#4 Bream 363 26.3 29 33.5 12.7 4.46
#5 Bream 430 26.5 29 34 12.4 5.13
#6 Bream 450 26.8 29.7 34.7 13.6 4.93
Using dplyr we can use select to select columns where all values are greater than 0 or are not numeric.
library(dplyr)
df %>% select(where(~(is.numeric(.) && all(. > 0)) || !is.numeric(.)))
# A tibble: 6 x 7
# Species WeightGRAM VertLengthCM DiagLengthCM CrossLengthCM HeightCM WidthCM
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Bream 242 23.2 25.4 30 11.5 4.02
#2 Bream 290 24 26.3 31.2 12.5 4.31
#3 Bream 340 23.9 26.5 31.1 12.4 4.70
#4 Bream 363 26.3 29 33.5 12.7 4.46
#5 Bream 430 26.5 29 34 12.4 5.13
#6 Bream 450 26.8 29.7 34.7 13.6 4.93
In the previous version of dplyr, we can use select_if :
df %>% select_if(~(is.numeric(.) && all(. > 0)) || !is.numeric(.))
you only need to specifiy the columns for the rowSums() function:
fish_data <- fish_data[which(rowSums(fish_data[,2:7]) > 0), ]
Note that rowSums() sums all values across the row — I'm not sure if that's what you really want to achieve?
you can check the output of rowsums with:
> rowSums(fish_data[,2:7])
[1] 336.1400 388.2856 438.5739 468.9855 537.0780 559.7298
Thanks all, I think I figured it out.
I should be keying:
fish_data[fish_data <= 0] <- NA #convert records with less than or equal to 0 to NA
fish_data <- na.omit(fish_data) # delete rows with NA
But i will get a warning message:
Warning message: In Ops.factor(left, right) : ‘<=’ not meaningful for
factors
# Option 1: (Safer because will retain rows containing NAs)
# Subset data.frame to not contain any observations with 0 values:
# data.frame => stdout (console)
df[rowMeans(df != 0, na.rm = TRUE) == 1,]
# Option 2: (More dangerous because it will remove all rows containing
# NAs) subset data.frame to not contain any observations with 0 values:
# data.frame => stdout (console)
df[complete.cases(replace(df, df == 0, NA)),]
# Option 3 (Variant of Option 1):
# Subset data.frame to not contain any observations with 0 values:
# data.frame => stdout (console)
df[rowMeans(Vectorize(function(x){x != 0})(df[,sapply(df, is.numeric)]),
na.rm = TRUE) == 1,]
# Option 4: Using Higher-order functions:
# Subset data.frame to not contain any observations with 0 values:
# data.frame => stdout (console)
df[Reduce(function(y, z){intersect(y, z)},
Map(function(x){which(x > 0)}, df[,sapply(df, is.numeric)])), ]
# Option 5 tidyverse:
# Subset data.frame to not contain any observations with 0 values:
# data.frame => stdout (console)
library(dplyr)
df %>%
filter_if(is.numeric, all_vars(. > 0))
Data:
df <- structure(list(Species = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label =
c("Bream", "Parkki", "Perch", "Pike", "Roach", "Smelt", "Whitefish"),
class = "factor"),
WeightGRAM = c(242, 290, 340, 363, 0, 450), VertLengthCM = c(23.2,
24, 23.9, 26.3, 26.5, 26.8), DiagLengthCM = c(25.4, 26.3,
26.5, 29, 29, 29.7), CrossLengthCM = c(30, 31.2, 31.1, 33.5,
34, 34.7), HeightCM = c(11.52, 0, 12.3778, 12.73, 12.444,
13.6024), WidthCM = c(4.02, 4.3056, 4.6961, 4.4555, 5.134,
4.9274)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))

R: Add interpolated values in between columns of dataframe?

I have a data frame that looks like this
Region 2000 2001 2002 2003 2004 2005
Australia 15.6 18.4 19.2 20.2 39.1 50.2
Norway 19.05 20.2 15.3 10 10.1 5.6
and basically I need a quick way to add extra columns in-between the currently existing columns that contain interpolated values of the surrounding columns.
Think of it like this: say you don't want columns for every year, but rather columns for every quarter. Then, for every pair of years (like 2000 and 2001), we would need to add 3 extra columns in-between these years.
The values of these columns will just be interpolated values. So, for Australia, the value in 2000 is 15.6 and in 2001 it is 18.4. So we calculate (18.4 - 15.6)/4 = 0.7, and then the values should now be 15.6, 16.3, 17, 17.7, and finally 18.4.
I have a working solution that builds up the new dataframe from scratch using a for loop. It is EXTREMELY slow. How to speed this up?
This is how I did it when I had a similar problem. Not the most sophisticated solution but it works.
Australia <- c(15.6, 18.4, 19.2, 20.2, 39.1, 50.2)

# Insert the midpoint between each pair of neighbouring values. Base R
# arithmetic replaces zoo::rollmean(x, 2). More importantly, the original
# `c(rbind(Australia, midpoints))` silently recycled the shorter midpoints
# vector (length n - 1 against length n), so the result wrapped around and
# appended garbage values at the end (the 33.600 17.000 16.300 tail below).
add_midpoints <- function(x) {
  mids <- (head(x, -1) + tail(x, -1)) / 2
  c(rbind(head(x, -1), mids), tail(x, 1))
}

biyearly <- add_midpoints(Australia)    # yearly -> half-yearly (11 values)
quarterly <- add_midpoints(biyearly)    # half-yearly -> quarterly (21 values)
quarterly
# [1] 15.600 16.300 17.000 17.700 18.400 18.600 18.800 19.000 19.200 19.450
#[11] 19.700 19.950 20.200 24.925 29.650 34.375 39.100 41.875 44.650 47.425
#[21] 50.200
Here is one way with tidyverse:
library(tidyverse)
df %>%
#get data in long format
pivot_longer(cols = -Region) %>%
#group by Region
group_by(Region) %>%
#Create 4 number sequence between every 2 value
summarise(temp = list(unlist(map2(value[-n()], value[-1], seq, length.out = 4)))) %>%
#Get data in long format
unnest(temp) %>%
group_by(Region) %>%
#Create column name
mutate(col = paste0(rep(names(df)[-c(1, ncol(df))], each = 4), "Q", 1:4)) %>%
#Spread data in wide format
pivot_wider(names_from = col, values_from = temp)
# A tibble: 2 x 21
# Groups: Region [2]
# Region `2000Q1` `2000Q2` `2000Q3` `2000Q4` `2001Q1` `2001Q2` `2001Q3` `2001Q4` `2002Q1`
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Austr… 15.6 16.5 17.5 18.4 18.4 18.7 18.9 19.2 19.2
#2 Norway 19.0 19.4 19.8 20.2 20.2 18.6 16.9 15.3 15.3
# … with 11 more variables: `2002Q2` <dbl>, `2002Q3` <dbl>, `2002Q4` <dbl>,
# `2003Q1` <dbl>, `2003Q2` <dbl>, `2003Q3` <dbl>, `2003Q4` <dbl>, `2004Q1` <dbl>,
# `2004Q2` <dbl>, `2004Q3` <dbl>, `2004Q4` <dbl>
data
df <- structure(list(Region = structure(1:2, .Label = c("Australia",
"Norway"), class = "factor"), `2000` = c(15.6, 19.05), `2001` = c(18.4,
20.2), `2002` = c(19.2, 15.3), `2003` = c(20.2, 10), `2004` = c(39.1,
10.1), `2005` = c(50.2, 5.6)), class = "data.frame", row.names = c(NA, -2L))
Here is a solution using dplyr. Should be more consistent and much faster than a loop:
# dummy data
df <- tibble(Region = LETTERS[1:5],
`2000` = 1:5,
`2001` = 3:7,
`2002` = 10:14)
# function to calculate quarterly values
into_quarter <- function(x) x / 4
df %>%
# create new variables that contain quarterly values
mutate_at(vars(starts_with("200")),
.funs = list("Q1" = into_quarter,
"Q2" = into_quarter,
"Q3" = into_quarter,
"Q4" = into_quarter)) %>%
# sort them appropriately.
# can also be done with base R and order(names), depending on desired result
select(Region,
starts_with("2000"),
starts_with("2001"),
starts_with("2002"),
# in case there are also other variables and to not lose any information
everything())

Resources