I have the following sample data:
samplesize=100
df <- data.frame(sex = sample(c("M", "F"), size = samplesize, replace = TRUE),
agegrp = sample(c("old", "middle", "young"), size = samplesize, replace = TRUE),
duration1 = runif(samplesize, min = 1, max = 100),
duration2 = runif(samplesize, min = 1, max = 100),
country = sample(c("USA", "CAN"), size = samplesize, replace = TRUE))
df
My goal is to plot a table like this that displays the median values [median(na.rm = TRUE) as there might be missing values]
USA CAN
total old middle young M F total old middle young M F
duration1 10.2 12.2 13.1 10.2 13.0 13.9 ... ... ... ... ... ...
duration2 10.4 13.2 13.2 10.0 13.1 14.0 ... ... ... ... ... ...
The way I would usually calculate such a table is to calculate the median values columnwise:
df %>%
group_by(country, agegrp) %>%
summarise(dur1 = median(duration1, na.rm = TRUE),
dur2 = median(duration, na.rm = TRUE)
And finally I put all the columns together. Unfortunately, as the number of combinations gets bigger, this methods becomes very cumbersome. So my question is:
Is there any function like table() that let's me calculate means or medians (instead of frequencies) using specific combinations of variables?
It would also be fine if it was just a two-dimensional table with multi-dimensional variable names like:
USA_total USA_old USA_middle USA_young USA_m USA_f CAN_total ...
duration1
duration2
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = -c(sex, agegrp, country), names_to = "parameters") %>%
group_by(agegrp, country, parameters) %>%
summarise(mean = mean(value, na.rm=TRUE)) %>%
pivot_wider(names_from = c(country, agegrp), values_from = mean)
Returns:
parameters CAN_middle USA_middle CAN_old USA_old CAN_young USA_young
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 duration1 48.6 62.6 31.5 40.0 43.0 50.5
2 duration2 60.9 54.0 53.1 58.9 45.1 55.6
Edit
Including M and F:
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = c(sex, agegrp), names_to = "groupings_names", values_to="groupings") %>%
select(-groupings_names) %>%
pivot_longer(cols = -c(groupings, country), names_to = "parameters") %>%
group_by(groupings, country, parameters) %>%
summarise(mean = mean(value, na.rm=TRUE)) %>%
pivot_wider(names_from = c(country, groupings), values_from = mean)
parameters CAN_F USA_F CAN_M USA_M CAN_middle USA_middle CAN_old USA_old CAN_young USA_young
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 duration1 63.3 59.0 50.9 47.7 57.9 46.1 56.8 60.6 59.5 49.1
2 duration2 60.6 59.0 54.9 48.3 65.0 45.6 48.5 49.5 55.8 62.4
Related
I would like to cut my dataframe after a certain marker. Means after the first time 3 or more times TRUE shows up (=marker) in V1, I would like to cut the dataframes within a list and take the following next 4 rows as my new dataframe within a list.
library(dplyr)
set.seed(94756)
mat1 <- matrix(sample(seq(-1,100, 0.11),70, replace = TRUE),ncol = 5)
mat1 <- as.tibble(mat1)
mat2 <- matrix(sample(seq(-1,100, 0.11),70, replace = TRUE),ncol = 5)
mat2 <- as.tibble(mat2)
mat2[3,1] <- NA
mat2[6,1] <- NA
mat3 <- matrix(sample(seq(-1,100, 0.11), 70,replace = TRUE),ncol = 5)
mat3 <- as.tibble(mat3)
mat3[4,1] <- NA
data <- list(mat1, mat2, mat3)
data1 <- map(data, ~add_column(., V1_logical = between(.$V1, 20, 80), .after = 'V1'))
r_pre <- lapply(data1, "[", 2)
Maybe it is helpful to add an ID column for each dataframe within the list
r_pre1 <- rbindlist(r_pre, idcol = "ID")
r_pre1 <- split(r_pre1, r_pre1$ID)
So the result should be like:
mat1re <- data.frame(V1 = c(93.16, 47.18, 12.86, 38.71),
V2 = c(56.75, 57.85, 18.69, 3.18),
V3 = c(-0.01, 14.95, 46.08, 96.46),
V4 = c(20.89, 32.55, 91.73, 58.73),
V5 = c(66.54, 56.75, 92.94, 77.54))
mat2re <- data.frame(V1 = c(87.99, 53.23, 40.36, 0.65),
V2 = c(89.42, 81.28, 36.84, 73.58),
V3 = c(89.86, 78.75, 76.77, 61.81),
V4 = c(47.18, 22.98, 34.64, 25.18),
V5 = c(18.69, 77.21, 58.29, 94.04))
mat3re <- data.frame(V1 = c(81.50, 43.55, 54.55, 9.45),
V2 = c(33.21, 70.83, 21.66, 88.10),
V3 = c(72.15, -0.45, 11.65, 15.06),
V4 = c(47.07, 47.95, 88.10, 81.50),
V5 = c(80.07, 67.75, 14.84, 10.33))
result <- list(mat1re, mat2re, mat3re)
What I've tried already:
data2 <- lapply(data1, function(x) {x$V1_logical[x$V1_logical== TRUE] <- 1; x})
data3 <- lapply(data2, function(x) {x$V1_logical[x$V1_logical== FALSE] <- 0; x})
data4 <- map(data3, ~add_column(., ind = rleid(.$V1_logical), .after = "V1_logical"))
So in data 4 it's about to find the marker: $V1_logical = 1 & $ind = number that shows up >= 3 times consecutively (e. g. 5, 5, 5) and cut the data before away incl. marker or in other word start new dataframes after the marker.
The following code is also close, but doesn't cut the beginning incl. marker out when NA's are included in the data...Have a look at the second list here, doesn't cut the beginning and marker out.
matrix_final <- map(data, ~ .x %>%
mutate(V1_logical = between(V1, 20, 80), ind = rleid(V1_logical), .after = "V1") %>%
group_by(ind) %>%
mutate(rn = if(n() >=3 && first(V1_logical)) row_number() else NA_integer_) %>%
ungroup %>%
slice(seq(max(which.max(rn) + 1, 1, replace_na = TRUE), length.out = 4)) %>%
select(-ind, -rn) %>%
mutate(across(everything(), round, digits = 2)))
print(matrix_final[[2]])
Thanks in advance!
We may loop over the list with map, create the logical column on 'V1' with between, create a grouping column with rleid (returns a sequence column that increments when there is a change in value in adjacent elements) and slice the rows based on the condition
library(dplyr)
library(purrr)
library(data.table)
library(tidyr)
map(data, ~ .x %>%
mutate(V1_logical = replace_na(between(V1, 20, 80), FALSE),
ind = rleid(V1_logical), .after = "V1") %>%
group_by(ind) %>%
mutate(rn = if(n() >=3 && first(V1_logical)) row_number() else
NA_integer_) %>%
ungroup %>%
slice(seq(max(which.max(rn) + 1, 1, na.rm = TRUE), length.out = 4)) %>%
select(-ind, -rn, -V1_logical) %>%
mutate(across(everything(), round, digits = 2)))
-output
[[1]]
# A tibble: 4 × 5
V1 V2 V3 V4 V5
<dbl> <dbl> <dbl> <dbl> <dbl>
1 93.2 56.8 -0.0100 20.9 66.5
2 47.2 57.8 15.0 32.6 56.8
3 12.9 18.7 46.1 91.7 92.9
4 38.7 3.18 96.5 58.7 77.5
[[2]]
# A tibble: 4 × 5
V1 V2 V3 V4 V5
<dbl> <dbl> <dbl> <dbl> <dbl>
1 88.0 89.4 89.9 47.2 18.7
2 53.2 81.3 78.8 23.0 77.2
3 40.4 36.8 76.8 34.6 58.3
4 0.65 73.6 61.8 25.2 94.0
[[3]]
# A tibble: 4 × 5
V1 V2 V3 V4 V5
<dbl> <dbl> <dbl> <dbl> <dbl>
1 81.5 33.2 72.2 47.1 80.1
2 43.6 70.8 -0.45 48.0 67.8
3 54.6 21.7 11.6 88.1 14.8
4 9.45 88.1 15.1 81.5 10.3
I am currently working with the palmer penguins data set in R and want to summarise data that combines means, median, range and quants, grouping by sex.
My current solution has the quant data split from the summary data. Is there a way to do this in one go. If not how do I combine the data sets. The group quant is currently in long format, and I am not sure how to combine them.
group_summary <- penguins %>% group_by(sex) %>% summarize(mean = mean(bill_length_mm,
na.rm = TRUE), meadian = median(bill_length_mm, na.rm = TRUE), range =
max(bill_length_mm, na.rm = TRUE) - min(bill_length_mm, na.rm = TRUE))
group_quant <- penguins %>% group_by(sex) %>% summarize(quantile(bill_length_mm,
probs =seq(.1, 1, by = .1), na.rm =TRUE, .groups = 'drop'))
I had the following solution but it drops the NA values from Sex and I am not sure why.
group_summary <- do.call(data.frame,aggregate(bill_length_mm ~ sex, penguins,
function(x) c(mean = mean(x, na.rm = TRUE), median = median(x, na.rm = TRUE), range =
max(x, na.rm = TRUE) - min(x, na.rm = TRUE), quantile(x, probs = seq(.1, 1, by = .1),
na.rm = TRUE, .groups = 'drop'))))
You may save the quantiles in a list and then use unnest_wider to create new columns from them. To calculate range I used diff(range(...)) instead of max(...) - min(...). Both of them are fine but I included it to show an alternative.
library(palmerpenguins)
library(dplyr)
library(tidyr)
penguins %>%
group_by(sex) %>%
summarize(mean = mean(bill_length_mm, na.rm = TRUE),
median = median(bill_length_mm, na.rm = TRUE),
range = diff(range(bill_length_mm, na.rm = TRUE)),
quantile = list(quantile(bill_length_mm, probs = seq(.1, 1, by = .1), na.rm = TRUE))) %>%
unnest_wider(quantile)
# sex mean median range `10%` `20%` `30%` `40%` `50%` `60%` `70%` `80%` `90%` `100%`
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 female 42.1 42.8 25.9 35.8 36.7 38.2 40 42.8 45.1 45.7 46.5 47.5 58
#2 male 45.9 46.8 25 38.8 40.5 41.3 43.2 46.8 49.0 50.0 50.8 51.9 59.6
#3 NA 41.3 42 13.2 36.8 37.7 37.8 38.6 42 44 44.5 45.2 46.4 47.3
I have a problem I'm trying to solve, and I can't seem to find a succinct solution. There are a few similar questions on SO, but nothing that quite fits.
Take some sample data:
library(dplyr)
dat <- tibble(
group1 = factor(sample(c("one", "two"), 10, replace = T)),
group2 = factor(sample(c("alpha", "beta"), 10, replace = T)),
var1 = rnorm(10, 20, 2),
var2 = rnorm(10, 20, 2),
var3 = rnorm(10, 20, 2),
other1 = sample(c("a", "b", "c"), 10, replace = T),
other2 = sample(c("a", "b", "c"), 10, replace = T),
)
I would like to summarise just the numeric variables (i.e. ignoring other1 and other2), but have the output grouped by group1 and group2.
I have tried something like this, but it returns an error as it attempts to apply my summarise() functions to the grouping variables too.
dat %>%
group_by(group1, group2) %>%
select(where(is.numeric)) %>%
map(~ .x %>%
filter(!is.na(.x)) %>%
summarise(mean = mean(.x),
sd = sd(.x),
median = median(.x),
q1 = quantile(.x, p = .25),
q3 = quantile(.x, p = .75))
)
My expected output would be something like
group1 group2 mean sd median q1 q3
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
1 one alpha ? ? ? ? ?
2 one beta ? ? ? ? ?
3 two alpha ? ? ? ? ?
4 two beta ? ? ? ? ?
Any solutions would be greatly appreciated.
Thanks,
Sam
Try:
dat %>% group_by(group1,group2) %>%
summarize(across(is.numeric,c(sd = sd,
mean = mean,
median =median,
q1 = function(x) quantile(x,.25),
q3 = function(x) quantile(x,.75))))
group1 group2 var1_sd var1_mean var1_median var1_q1 var1_q3 var2_sd var2_mean var2_median var2_q1 var2_q3 var3_sd
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 one alpha 4.06 20.6 19.3 18.3 22.2 1.12 17.9 17.3 17.2 18.2 1.09
2 one beta 0.726 18.7 18.7 18.4 18.9 0.348 18.8 18.8 18.7 18.9 0.604
3 two alpha 1.31 19.9 20.0 19.3 20.6 1.10 17.8 18.3 17.4 18.5 0.624
4 two beta 0.777 21.2 21.2 21.0 21.5 1.13 19.6 19.6 19.2 20.0 0.0161
You can also pass the columns to the functions in summarise:
dat %>%
group_by(group1, group2) %>%
summarise(mean = mean(var1:var3),
sd = sd(var1:var3),
median = median(var1:var3),
q1 = quantile(var1:var3, p = .25),
q3 = quantile(var1:var3, p = .75))
dat
# A tibble: 4 x 7
# Groups: group1 [2]
# group1 group2 mean sd median q1 q3
# <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 one alpha 19.1 0.707 19.1 18.8 19.3
# 2 one beta 17.5 1.29 17.5 16.8 18.3
# 3 two alpha 17.1 NA 17.1 17.1 17.1
# 4 two beta 19.9 NA 19.9 19.9 19.9
I have a data frame that looks like this
Region 2000 2001 2002 2003 2004 2005
Australia 15.6 18.4 19.2 20.2 39.1 50.2
Norway 19.05 20.2 15.3 10 10.1 5.6
and basically I need a quick way to add extra columns in-between the currently existing columns that contain interpolated values of the surrounding columns.
Think of it like this: say you don't want columns for every year, but rather columns for every quarter. Then, for every pair of years (like 2000 and 2001), we would need to add 3 extra columns in-between these years.
The values of these columns will just be interpolated values. So, for Australia, the value in 2000 is 15.6 and in 2001 it is 18.4. So we calculate (18.4 - 15.6)/4 = 0.7, and then the values should now be 15.6, 16.3, 17, 17.7, and finally 18.4.
I have a working solution that builds up the new dataframe from scratch using a for loop. It is EXTREMELY slow. How to speed this up?
This is how I did it when I had a similar problem. Not the most sophisticated solution but it works.
Australia=c( 15.6, 18.4, 19.2, 20.2, 39.1, 50.2)
library(zoo)
midpoints=rollmean(Australia, 2)
biyearly=c(rbind(Australia,midpoints))
midpoints=rollmean(biyearly, 2)
quarterly=c(rbind(biyearly,midpoints))
quarterly
#[1] 15.600 16.300 17.000 17.700 18.400 18.600 18.800 19.000 19.200 19.450 19.700
#[12] 19.950 20.200 24.925 29.650 34.375 39.100 41.875 44.650 47.425 50.200 33.600
#[23] 17.000 16.300
Here is one way with tidyverse:
library(tidyverse)
df %>%
#get data in long format
pivot_longer(cols = -Region) %>%
#group by Region
group_by(Region) %>%
#Create 4 number sequence between every 2 value
summarise(temp = list(unlist(map2(value[-n()], value[-1], seq, length.out = 4)))) %>%
#Get data in long format
unnest(temp) %>%
group_by(Region) %>%
#Create column name
mutate(col = paste0(rep(names(df)[-c(1, ncol(df))], each = 4), "Q", 1:4)) %>%
#Spread data in wide format
pivot_wider(names_from = col, values_from = temp)
# A tibble: 2 x 21
# Groups: Region [2]
# Region `2000Q1` `2000Q2` `2000Q3` `2000Q4` `2001Q1` `2001Q2` `2001Q3` `2001Q4` `2002Q1`
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Austr… 15.6 16.5 17.5 18.4 18.4 18.7 18.9 19.2 19.2
#2 Norway 19.0 19.4 19.8 20.2 20.2 18.6 16.9 15.3 15.3
# … with 11 more variables: `2002Q2` <dbl>, `2002Q3` <dbl>, `2002Q4` <dbl>,
# `2003Q1` <dbl>, `2003Q2` <dbl>, `2003Q3` <dbl>, `2003Q4` <dbl>, `2004Q1` <dbl>,
# `2004Q2` <dbl>, `2004Q3` <dbl>, `2004Q4` <dbl>
data
df <- structure(list(Region = structure(1:2, .Label = c("Australia",
"Norway"), class = "factor"), `2000` = c(15.6, 19.05), `2001` = c(18.4,
20.2), `2002` = c(19.2, 15.3), `2003` = c(20.2, 10), `2004` = c(39.1,
10.1), `2005` = c(50.2, 5.6)), class = "data.frame", row.names = c(NA, -2L))
Here is a solution using dplyr. Should be more consistent and much faster than a loop:
# dummy data
df <- tibble(Region = LETTERS[1:5],
`2000` = 1:5,
`2001` = 3:7,
`2002` = 10:14)
# function to calculate quarterly values
into_quarter <- function(x) x / 4
df %>%
# create new variables that contain quarterly values
mutate_at(vars(starts_with("200")),
.funs = list("Q1" = into_quarter,
"Q2" = into_quarter,
"Q3" = into_quarter,
"Q4" = into_quarter)) %>%
# sort them approriatly.
# can also be done with base R and order(names), depending on desired result
select(Region,
starts_with("2000"),
starts_with("2001"),
starts_with("2002"),
# in case there are also other variables and to not loose any information
everything())
I have a dataframe PatientA
Height Weight Age BMI
<dbl> <dbl> <dbl> <dbl>
1 161 72.2 27 27.9
2 164 61.0 21 22.8
3 171 72.0 30 24.6
4 169. 63.9 25 22.9
5 174. 64.4 27 21.1
6 160 50.9 22 19.9
7 172 77.5 22 26.3
8 165 54.5 22 20
9 173 82.4 29 27.5
10 169 76.6 22 26.9
and I would like to get some statistics for each column. I have the next working code which deals only with quantiles
genStat <- PatientsA %>%
summarise_all(funs(list(quantile(., probs = c(0.25, 0.5, 0.75))))) %>%
unnest %>%
transpose %>%
setNames(., c('25%', '50%', '75%')) %>%
map_df(unlist) %>%
bind_cols(data.frame(vars = names(PatientsA)), .)
and I need to add mean and sd to summarise_all like this
genStat <- PatientsA %>%
summarise_all(funs(mean,sd,list(quantile(., probs = c(0.25, 0.5, 0.75))))) %>%
unnest %>%
transpose %>%
setNames(., c('mean','sd','25%', '50%', '75%')) %>%
map_df(unlist) %>%
bind_cols(data.frame(vars = names(PatientsA)), .)
This straightforward approach fails returning the next error:
Error in names(object) <- nm : 'names' attribute [5] must be the
same length as the vector [3]
I'm a newbie in R, so what is the right syntax for completing this task?
This is what I would suggest. There is a little repetition in the code (calling quantile three times) but overall I think it is easier to understand and debug.
library(tidyverse)
PatientsA %>%
gather("variable", "value") %>%
group_by(variable) %>%
summarize(mean_val = mean(value),
sd_val = sd(value),
q25 = quantile(value, probs = .25),
q50 = quantile(value, probs = .5),
q75 = quantile(value, probs = .75))
## A tibble: 4 x 6
# variable mean_val sd_val q25 q50 q75
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Age 24.7 3.33 22 23.5 27
#2 BMI 24.0 3.08 21.5 23.8 26.7
#3 Height 168. 5.01 164. 169 172.
#4 Weight 67.5 10.3 61.7 68.2 75.5
We could also place the quantile output in a list and then unnest
library(tidyverse)
PatientsA %>%
gather %>%
group_by(key) %>%
summarise_at(vars('value'),
funs(mean,
sd,
quantile = list(as.tibble(as.list(quantile(.,
probs = c(0.25, 0.5, 0.75))))))) %>%
unnest
# A tibble: 4 x 6
# key mean sd `25%` `50%` `75%`
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Age 24.7 3.33 22 23.5 27
#2 BMI 24.0 3.08 21.5 23.8 26.7
#3 Height 168. 5.01 164. 169 172.
#4 Weight 67.5 10.3 61.7 68.2 75.5
Or using pivot_longer
PatientsA %>%
pivot_longer(cols = everything()) %>%
group_by(name) %>%
summarise(across(value, list(mean= ~ mean(., na.rm = TRUE),
sd = ~ sd(., na.rm = TRUE),
quantile = ~ list(as_tibble(as.list(quantile(.,
probs = c(0.25, 0.5, 0.75)))))))) %>%
unnest(c(value_quantile))
# A tibble: 4 x 6
name value_mean value_sd `25%` `50%` `75%`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Age 24.7 3.33 22 23.5 27
2 BMI 24.0 3.08 21.5 23.8 26.7
3 Height 168. 5.01 164. 169 172.
4 Weight 67.5 10.3 61.7 68.2 75.5
###data
PatientsA <- structure(list(Height = c(161, 164, 171, 169, 174, 160, 172,
165, 173, 169), Weight = c(72.2, 61, 72, 63.9, 64.4, 50.9, 77.5,
54.5, 82.4, 76.6), Age = c(27L, 21L, 30L, 25L, 27L, 22L, 22L,
22L, 29L, 22L), BMI = c(27.9, 22.8, 24.6, 22.9, 21.1, 19.9, 26.3,
20, 27.5, 26.9)), class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6", "7", "8", "9", "10"))