Separate formula - r

I keep getting the warning below when I try to separate my column by ".". My code is:
# Inline copy of the table: a header with four quoted group names, followed by
# five rows that each start with an age band and then four rates.
# NOTE(review): the ")" just before the closing quote looks like a paste artifact — confirm.
txt <- "'Rural Male' 'Rural Female' 'Urban Male' 'Urban Female'
50-54 11.7 8.7 15.4 8.4
55-59 18.1 11.7 24.3 13.6
60-64 26.9 20.3 37.0 19.3
65-69 41.0 30.9 54.6 35.1
70-74 66.0 54.3 71.1 50.0)"
# Parse the inline text. The header has one field fewer than the data rows;
# presumably the age bands therefore become row names (they are read back with row.names() below).
data <- read.table(header = TRUE, text = txt)
# Copy the row names into an `age` column, then stack every other column into key / death_rate pairs.
# NOTE(review): the trailing backtick on the next line looks like a paste artifact — confirm.
datanew <- data %>% tbl_df() %>% mutate(age= row.names(data)) %>% gather(key, death_rate, -age)`
# Split `key` into columns a and b. This is the call that raises the warning:
# `sep` is treated as a regular expression, so "." matches any character rather than a literal dot.
separate(data = datanew,col = key, sep = ".", into = c("a","b"))
Warning message
Expected 2 pieces. Additional pieces discarded in 20 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20].

separate() interprets sep as a regular expression, and in regular expressions . means "any character". To match a literal ., escape it with backslashes: use sep = "\\."
separate(data = datanew, col = key, sep = "\\.", into = c("a","b"))
# # A tibble: 20 x 4
# age a b death_rate
# <chr> <chr> <chr> <chr>
# 1 50-54 Rural Male 11.7
# 2 55-59 Rural Male 18.1
# 3 60-64 Rural Male 26.9
# 4 65-69 Rural Male 41
# 5 70-74 Rural Male 66
# ...
The default sep for separate() matches any run of characters that are neither letters nor digits, so in this case you could also just use the default:
separate(data = datanew, col = key, into = c("a","b"))
# same result

Related

Classify table based on value 'moving window' range and proportions?

I have a datasets of forest stands, each containing several tree layers of different age and volume.
I want to classify the stands as even- or uneven-aged, combining volume and age data. A stand is considered even-aged if more than 80% of its volume is allocated to age classes that are at most 20 years apart. I wonder how to implement the 'within 20 years apart' condition. I can easily calculate the total volume and the share of each individual tree layer (strat), but how do I check how many years apart the layers are? Is it some sort of moving window?
Dummy example:
# Dummy data: two stands, each with several tree layers (strat) that have a
# volume (v) and an age.
library(dplyr)
df <- data.frame(stand = c("id1", "id1", "id1", "id1",
'id2', 'id2', 'id2'),
strat = c(1,2,3,4,
1,2,3),
v = c(4,10,15,20,
11,15,18),
age = c(5,10,65,80,
10,15,20))
# Rule to implement: a stand is even-aged if more than 80% of its volume sits
# in layers whose ages fall within a 20-year range.
# Per stand: total volume (V_tot) and each layer's percentage share of it (V_share).
df %>%
group_by(stand) %>%
mutate(V_tot = sum(v)) %>%
mutate(V_share = v/V_tot*100)
Expected outcome:
stand strat v age V_tot V_share quality
<fct> <dbl> <dbl> <dbl> <dbl> <dbl>
1 id1 1 4 5 49 8.16 uneven-aged
2 id1 2 10 10 49 20.4 uneven-aged
3 id1 3 15 65 49 30.6 uneven-aged
4 id1 4 20 80 49 40.8 uneven-aged #* because age classes 65 and 80, although less than 20 years apart, hold only about 71% of the total volume
5 id2 1 11 10 44 25 even-aged
6 id2 2 15 15 44 34.1 even-aged
7 id2 3 18 20 44 40.9 even-aged
Another tidyverse solution implementing a moving window:
library(tidyverse)
df <- structure(list(stand = c("id1", "id1", "id1", "id1", "id2", "id2", "id2"), strat = c(1, 2, 3, 4, 1, 2, 3), v = c(4, 10, 15, 20, 11, 15, 18), age = c(5, 10, 65, 80, 10, 15, 20), V_tot = c(49, 49, 49, 49, 44, 44, 44), V_share = c(8.16326530612245, 20.4081632653061, 30.6122448979592, 40.8163265306122, 25, 34.0909090909091, 40.9090909090909)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -7L))
# Classify every stand by sliding a 20-year window over the ages of its layers.
#
# A window is anchored at each observed age and reaches 20 years forward, so
# all layers counted together are at most 20 years apart. (A window centred on
# a layer, i.e. abs(age - x) <= 20, would pool layers up to 40 years apart.)
# Anchoring only at observed ages is sufficient: any 20-year window can be
# shifted to start at its youngest layer without losing a layer.
#
# range20: percentage of the stand's volume inside the window starting at that
#          layer's age.
# quality: "even-aged" when any window of the stand holds more than 80%.
df %>%
  group_by(stand) %>%
  mutate(
    range20 = map_dbl(age, ~ sum(V_share[age >= .x & age <= .x + 20])),
    quality = ifelse(any(range20 > 80), "even-aged", "uneven-aged")
  )
#> # A tibble: 7 × 8
#> # Groups: stand [2]
#> stand strat v age V_tot V_share range20 quality
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 id1 1 4 5 49 8.16 28.6 uneven-aged
#> 2 id1 2 10 10 49 20.4 20.4 uneven-aged
#> 3 id1 3 15 65 49 30.6 71.4 uneven-aged
#> 4 id1 4 20 80 49 40.8 40.8 uneven-aged
#> 5 id2 1 11 10 44 25 100 even-aged
#> 6 id2 2 15 15 44 34.1 75 even-aged
#> 7 id2 3 18 20 44 40.9 40.9 even-aged
Created on 2021-09-08 by the reprex package (v2.0.1)
Interesting issue, I think I have a solution using the runner package
# Same classification with a running sum from the runner package.
library(runner)

df %>%
  group_by(stand) %>%
  mutate(
    V_tot = sum(v),
    V_share = v / V_tot * 100,
    # Running share of volume over a trailing age window. k counts index units
    # including the current one, so k = 21 covers ages [age - 20, age], i.e.
    # layers up to and including 20 years apart (k = 20 would stop at 19).
    # NOTE(review): runner presumably expects idx in ascending order within
    # each group, as in this data — confirm before using unsorted ages.
    test = sum_run(
      V_share,
      k = 21L,
      idx = age,
      na_rm = TRUE,
      na_pad = FALSE
    ),
    # "More than 80%" is a strict inequality.
    quality = if_else(any(test > 80), "even-aged", "uneven-aged")
  ) %>%
  ungroup() %>%
  select(-test)

calculating medians per year per ID in R and plotting the outcome

Dataset:
structure(list(ID = c(1234, 1234, 1234, 1234, 1234, 1234, 1234,
1234, 8769, 8769, 8769, 8769, 8769, 7457, 7457, 7457, 7457, 7457,
7457, 55667, 55667, 55667, 55667, 55667, 55667, 55667, 3789,
3789, 3789, 3789, 3789, 3789), date_of_bloods = structure(c(978307200,
981072000, 1173052800, 1175731200, 1367798400, 1465171200, 1467936000,
1659916800, 1072915200, 1075680000, 1173052800, 1175731200, 1367798400,
978307200, 981072000, 1173052800, 1175731200, 1367798400, 1465171200,
978307200, 981072000, 1173052800, 1270425600, 1273104000, 1465171200,
1467936000, 1270425600, 1367798400, 1465171200, 1465257600, 1465344000,
1465430400), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
result = c(90, 80, 60, 40, 25, 22, 22, 21, 70, 65, 43, 23,
22, 90, 90, 88, 86, 76, 74, 58, 46, 35, 34, 33, 30, 24, 76,
67, 56, 34, 33, 23), `mutation type` = c(1, 1, 1, 1, 1, 1,
1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
3, 1, 1, 1, 1, 1, 1)), row.names = c(NA, -32L), class = "data.frame")
I would like the median of results per year per ID in a format where the year is just 0,1,2,3 etc for uniformity across cohorts and then to plot these lines with some indication of their mutation category.
I have done:
# Reduce each sample date to its calendar year (stored as text).
filtered$date_of_bloods <-format(filtered$date_of_bloods,format="%Y")
# Split the data into one data frame per ID.
a <- with(filtered, split(filtered, list(ID)))
# Median per year within one ID's data frame.
# NOTE(review): the formula refers to `results`, but the dataset shown above names the column `result` — confirm.
medianfunc <- function(y) {aggregate(results ~ date_of_bloods, data = y, median)}
medians <- sapply(a, medianfunc)
# Fit a linear model per ID cohort and extract its coefficients.
g<- as.data.frame(medians)
# NOTE(review): as written the model regresses the year on the results; for a slope of
# results over time the two sides of the formula would be swapped — confirm intent.
coefLM <- function(x) {coef(lm(date_of_bloods ~ results, data = x))}
coefs<- sapply(g, coefLM)
The actual years don't matter and for uniformity I would like them to be 0,1,2,3,4 etc per ID. I am not sure how to do that? I would then want to plot this data (median yearly bloods per ID) with some form of idea as to which mutational category they belong.
I hope this isn't too broad a question.
Many thanks
You can try this (filtered is the dput() you included). I hope this helps:
library(dplyr)
library(lubridate)
library(ggplot2)
library(broom)
# Data: median result for every ID / calendar year / mutation type.
df1 <- filtered %>%
  mutate(year = year(date_of_bloods)) %>%
  group_by(ID, year, `mutation type`) %>%
  summarise(med = median(result))

# Variables: drop the grouping and treat the three identifiers as categories.
df1 <- df1 %>%
  ungroup() %>%
  mutate(across(c(ID, year, `mutation type`), as.factor))

# Plot: one line per year across the IDs.
ggplot(
  data = df1,
  mapping = aes(x = ID, y = med, fill = `mutation type`, color = year, group = year)
) +
  geom_line()
And for models:
# Models: one linear trend of the yearly median per ID.
# `year` was turned into a factor for plotting. Convert it back to a number
# first, otherwise lm() estimates one dummy coefficient per year instead of
# the single slope per ID shown in the coefficient table.
fits <- df1 %>%
  mutate(year = as.numeric(as.character(year))) %>%
  group_by(ID) %>%
  do(fitmodel = lm(med ~ year, data = .))

# Coefs: tidy each fitted model and stack the tables, labelled by ID.
# tidy() is applied to every lm object directly, so this does not depend on
# broom providing a method for the rowwise result of do().
dfCoef <- bind_rows(
  setNames(lapply(fits$fitmodel, tidy), as.character(fits$ID)),
  .id = "ID"
)
# A tibble: 10 x 6
# Groups: ID [5]
ID term estimate std.error statistic p.value
<dbl> <chr> <dbl> <dbl> <dbl> <dbl>
1 1234 (Intercept) 6329. 1546. 4.09 0.0264
2 1234 year -3.13 0.769 -4.07 0.0268
3 3789 (Intercept) 14318. 4746. 3.02 0.204
4 3789 year -7.08 2.36 -3.00 0.205
5 7457 (Intercept) 2409. 403. 5.98 0.0269
6 7457 year -1.16 0.201 -5.78 0.0287
7 8769 (Intercept) 9268. 4803. 1.93 0.304
8 8769 year -4.60 2.39 -1.92 0.306
9 55667 (Intercept) 3294. 759. 4.34 0.0492
10 55667 year -1.62 0.378 -4.29 0.0503
Code for required plot:
# Plot 2
# Data modifications: replace each year by the zero-based position of its
# factor level, then store that position as a factor with sorted levels so it
# can serve as a discrete x axis.
df2 <- df1 %>%
  mutate(
    year2 = as.numeric(year) - 1,
    year2 = factor(year2, levels = sort(unique(year2)))
  )

# One line per ID, one panel per mutation type.
ggplot(data = df2, mapping = aes(x = year2, y = med, color = ID, group = ID)) +
  geom_line() +
  facet_wrap(. ~ `mutation type`)
Your naming structure is unclear, if the data you provided is called df then you can do:
df$year <-format(df$date_of_bloods,format="%Y")
aggregate(result ~ year + ID, data = df, median)
year ID result
1 2001 1234 85.0
2 2007 1234 50.0
3 2013 1234 25.0
4 2016 1234 22.0
5 2022 1234 21.0
6 2010 3789 76.0
7 2013 3789 67.0
8 2016 3789 33.5
9 2001 7457 90.0
10 2007 7457 87.0
11 2013 7457 76.0
12 2016 7457 74.0
13 2004 8769 67.5
14 2007 8769 33.0
15 2013 8769 22.0
16 2001 55667 52.0
17 2007 55667 35.0
18 2010 55667 33.5
19 2016 55667 27.0

Dumbbell plot: Order and value label

I am working on a dumbbell plot in R inspired by this post, and have two problems:
Ordering the dumbbell plot (I've tried a strategy provided in this post)
Present value labels in the plot in an aesthetically pleasing way.
My data set is formatted as a wide data set with 18 units with the following structure:
> head(ADHD_med_2010_2018_wide, 18)
# A tibble: 18 x 9
age gender county adhd_pr_1000_2010 adhd_pr_1000_2018 county_label adhd_2010 adhd_2018 diff
<dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl> <dbl> <fct> <dbl> <dbl> <dbl>
1 2 [10-14] 1 [Both genders] 1 [Østfold] 32.1 24.3 Østfold 32.1 24.3 -7.80
2 2 [10-14] 1 [Both genders] 2 [Akershus] 20.6 23.0 Akershus 20.6 23 2.40
3 2 [10-14] 1 [Both genders] 3 [Oslo] 17.2 33.9 Oslo 17.2 33.9 16.7
4 2 [10-14] 1 [Both genders] 4 [Hedmark] 41.7 30.9 Hedmark 41.7 30.9 -10.8
5 2 [10-14] 1 [Both genders] 5 [Oppland] 24.9 39.0 Oppland 24.9 39 14.1
6 2 [10-14] 1 [Both genders] 6 [Buskerud] 26.7 36.8 Buskerud 26.7 36.8 10.1
7 2 [10-14] 1 [Both genders] 7 [Vestfold] 28.1 27.1 Vestfold 28.1 27 -1.10
8 2 [10-14] 1 [Both genders] 8 [Telemark] 29.2 24.7 Telemark 29.2 24.7 -4.5
9 2 [10-14] 1 [Both genders] 9 [Aust-Agder] 34.9 39.2 Aust-Agder 34.9 39.2 4.30
10 2 [10-14] 1 [Both genders] 10 [Vest-Agder] 17.4 23.8 Vest-Agder 17.4 23.8 6.40
11 2 [10-14] 1 [Both genders] 11 [Rogaland] 29.5 13.8 Rogaland 29.5 13.8 -15.7
12 2 [10-14] 1 [Both genders] 12 [Hordaland] 21.3 14.4 Hordaland 21.3 14.4 -6.90
13 2 [10-14] 1 [Both genders] 14 [Sogn og Fjordane] 21.3 39.7 Sogn og Fjordane 21.3 39.7 18.4
14 2 [10-14] 1 [Both genders] 15 [Møre og Romsdal] 27.0 18.6 Møre og Romsdal 27 18.6 -8.40
15 2 [10-14] 1 [Both genders] 18 [Nordland] 40.1 30.0 Nordland 40.1 30 -10.1
16 2 [10-14] 1 [Both genders] 19 [Troms] 25.8 33.2 Troms 25.8 33.2 7.40
17 2 [10-14] 1 [Both genders] 20 [Finnmark] 19.1 21.3 Finnmark 19.1 21.3 2.20
18 2 [10-14] 1 [Both genders] 50 [Trøndelag] 25.0 36.9 Trøndelag 25 37 12
I've tried two strategies for problem 1:
library("tidyverse")
library("ggalt")
fig2 <- ggplot(ADHD_med_2010_2018_wide, aes(x=adhd_2010, xend=adhd_2018, y=county_label, group=county_label)) +
#create a thick line between x and xend instead of using defaut
#provided by geom_dubbell
geom_segment(aes(x=adhd_2010,
xend=adhd_2018,
y=county_label,
yend=county_label),
color="#b2b2b2", size=1.5)+
geom_dumbbell(color="light blue",
size_x=3.5,
size_xend = 3.5,
#Note: there is no US:'color' for UK:'colour'
# in geom_dumbbel unlike standard geoms in ggplot()
colour_x="forestgreen", # green = 2010
colour_xend = "red")+ # red = 2018
labs(x=NULL, y=NULL,
title="Dumbbell Chart",
subtitle="Change in prescription rate: 2010 vs 2018")+
geom_text(color="black", size=2, hjust=-0.5,
aes(x=adhd_2010, label=adhd_2010))+
geom_text(aes(x=adhd_2018, label=adhd_2018),
color="black", size=2, hjust=1.5)
fig2
Which gives a plot without ordering or values presented in a good way:
To correct ordering, I tried following the strategy provided in the post linked above:
library(dplyr)
ADHD_med_2010_2018_wide%>%
mutate(difference = abs(adhd_2018-adhd_2010)) %>% # absolute change between 2010 and 2018
top_n(18, wt = difference) %>% # keep the 18 rows with the largest difference (the sample data has 18 rows)
ggplot() +
aes(x=adhd_2010, xend=adhd_2018, y=reorder(county_label, difference),
group=county_label) + # order the county labels by difference; NOTE(review): reorder() presumably sorts ascending by default — confirm the intended direction
geom_dumbbell(color="light blue",
size_x=3.5,
size_xend = 3.5,
# Note: geom_dumbbell() names its point colours with the UK spelling
# (colour_x / colour_xend) only, unlike the standard ggplot2 geoms.
colour_x="forestgreen", # green = 2010
colour_xend = "red")+ # red = 2018
labs(x=NULL, y=NULL,
title="Dumbbell Chart",
subtitle="Change in prescription rate: 2010 vs 2018")+
# Value labels: 2010 value next to the green point, 2018 value next to the red point.
geom_text(color="black", size=2, hjust=-0.5,
aes(x=adhd_2010, label=adhd_2010))+
geom_text(aes(x=adhd_2018, label=adhd_2018),
color="black", size=2, hjust=1.5)
This still does not give a plot with a nice ordering, although it seems to order by the difference (and there is still the issue with the value labels):
Hopefully some of you may have input on these issues.
Data to copy:
> dput(head(ADHD_med_2010_2018_wide, 18))
structure(list(age = structure(c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2), label = "Age groups", labels = c(`5-9` = 1,
`10-14` = 2, `15-19` = 3, `20-24` = 4, `25-29` = 5, `30-34` = 6,
`All ages` = 7), class = "haven_labelled"), gender = structure(c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "Gender", labels = c(`Both genders` = 1,
Female = 2, Male = 3), class = "haven_labelled"), county = structure(c(1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19, 20, 50), labels = c(Østfold = 1,
Akershus = 2, Oslo = 3, Hedmark = 4, Oppland = 5, Buskerud = 6,
Vestfold = 7, Telemark = 8, `Aust-Agder` = 9, `Vest-Agder` = 10,
Rogaland = 11, Hordaland = 12, `Sogn og Fjordane` = 14, `Møre og Romsdal` = 15,
Nordland = 18, Troms = 19, Finnmark = 20, Trøndelag = 50, `Hele landet` = 99
), class = "haven_labelled"), adhd_pr_1000_2010 = c(32.1488990783691,
20.5894756317139, 17.2119483947754, 41.6982574462891, 24.8543014526367,
26.7194156646729, 28.1328239440918, 29.2480430603027, 34.8775291442871,
17.3759765625, 29.4698066711426, 21.340311050415, 21.3308296203613,
27.0334072113037, 40.1140670776367, 25.7862873077393, 19.1311283111572,
25.0325565338135), adhd_pr_1000_2018 = c(24.2834396362305, 23.0037822723389,
33.9068183898926, 30.8641967773438, 39.0195579528809, 36.7909698486328,
27.0642204284668, 24.6901988983154, 39.1978950500488, 23.8095245361328,
13.8218154907227, 14.4400091171265, 39.7175636291504, 18.5994052886963,
29.9642810821533, 33.1638412475586, 21.2596340179443, 36.9249382019043
), county_label = structure(18:1, .Label = c("Trøndelag", "Finnmark",
"Troms", "Nordland", "Møre og Romsdal", "Sogn og Fjordane", "Hordaland",
"Rogaland", "Vest-Agder", "Aust-Agder", "Telemark", "Vestfold",
"Buskerud", "Oppland", "Hedmark", "Oslo", "Akershus", "Østfold"
), class = "factor"), adhd_2010 = c(32.0999984741211, 20.6000003814697,
17.2000007629395, 41.7000007629395, 24.8999996185303, 26.7000007629395,
28.1000003814697, 29.2000007629395, 34.9000015258789, 17.3999996185303,
29.5, 21.2999992370605, 21.2999992370605, 27, 40.0999984741211,
25.7999992370605, 19.1000003814697, 25), adhd_2018 = c(24.2999992370605,
23, 33.9000015258789, 30.8999996185303, 39, 36.7999992370605,
27, 24.7000007629395, 39.2000007629395, 23.7999992370605, 13.8000001907349,
14.3999996185303, 39.7000007629395, 18.6000003814697, 30, 33.2000007629395,
21.2999992370605, 37), diff = c(-7.79999923706055, 2.39999961853027,
16.7000007629395, -10.8000011444092, 14.1000003814697, 10.0999984741211,
-1.10000038146973, -4.5, 4.29999923706055, 6.39999961853027,
-15.6999998092651, -6.89999961853027, 18.4000015258789, -8.39999961853027,
-10.0999984741211, 7.40000152587891, 2.19999885559082, 12)), row.names = c(NA,
-18L), class = c("tbl_df", "tbl", "data.frame"))
An easy way to order your plot is to use the arrange() function from dplyr to sort your data frame by the column(s) of your choice, and then convert the grouping variable (county_label) to a factor whose levels follow that sorted order:
library(dplyr)
library(ggplot2)
# Order the counties by their 2010 rate: sort the rows, then rebuild
# county_label as a factor whose levels follow that row order.
DF %>%
  arrange(adhd_2010) %>%
  mutate(county_label = factor(county_label, levels = unique(county_label))) %>%
  ggplot(aes(x = adhd_2010, xend = adhd_2018, y = county_label, group = county_label)) +
  # Thick grey connector between the two years, drawn instead of relying on
  # the default line of geom_dumbbell(). x, xend and y are inherited from the
  # plot-level mapping; only yend has to be supplied here.
  geom_segment(aes(yend = county_label), color = "#b2b2b2", size = 1.5) +
  geom_dumbbell(
    color = "light blue",
    size_x = 3.5,
    size_xend = 3.5,
    # Note: geom_dumbbell() names its point colours with the UK spelling
    # (colour_x / colour_xend) only, unlike the standard ggplot2 geoms.
    colour_x = "forestgreen", # green = 2010
    colour_xend = "red"       # red = 2018
  ) +
  labs(
    x = NULL,
    y = NULL,
    title = "Dumbbell Chart",
    subtitle = "Change in prescription rate: 2010 vs 2018"
  ) +
  # Value labels next to the 2010 and 2018 points.
  geom_text(aes(x = adhd_2010, label = adhd_2010), color = "black", size = 2, hjust = -0.5) +
  geom_text(aes(x = adhd_2018, label = adhd_2018), color = "black", size = 2, hjust = 1.5)
It is not perfect, but at least your values are now ordered. You can then change the column by which the plot is ordered (here I order by adhd_2010).
# Reformat data
# DF2 only supplies the county order (highest 2010 rate first).
DF2 <- DF %>% arrange(desc(adhd_2010))
# DF3 is the plotting data: a constant `key` column for dumbbell(), counties
# as a factor in DF2's order, and both rates rounded to 3 significant digits
# so the value labels stay short.
DF3 <- DF %>% mutate(key = "Change in Prescription Rate")
DF3$county_label <- factor(DF3$county_label, levels = as.character(DF2$county_label))
DF3$adhd_2018 <- signif(DF3$adhd_2018, digits = 3)
DF3$adhd_2010 <- signif(DF3$adhd_2010, digits = 3)

# Plot
# The stray empty argument (", ,") of the original call is removed; every
# option is now passed by name.
dumbbell::dumbbell(
  DF3,
  id = "county_label",
  key = "key",
  column1 = "adhd_2010",
  column2 = "adhd_2018",
  lab1 = "2010",
  lab2 = "2018",
  delt = 1,
  textsize = 2,
  pt_val = 1,
  pointsize = 2
) +
  xlim(13, 43) +
  labs(
    x = NULL,
    y = NULL,
    title = "Dumbbell Chart",
    subtitle = "Change in prescription rate: 2010 vs 2018"
  )
I added in a few bells and whistles, just toggle the options to remove
I hope someone finds it useful
Enjoy!

how to use an element of a character vector as a symbol argument to a function using non-standard evaluation := operator

I'm trying to write a function that accepts a character vector of variable names as symbolic arguments.
Here is some data taken from the "fertility" dataset in the questionr package. The important thing is that it includes some columns of labelled data.
library(tidyverse)
library(labelled)
df <- structure(list(id_woman = structure(c(391, 1643, 85, 881, 1981,
1072, 1978, 1607, 738), label = "Woman Id",
format.spss = "F8.0"),
weight = structure(c(1.80315, 1.80315, 1.80315, 1.80315,
1.80315, 0.997934, 0.997934, 0.997934, 0.192455),
label = "Sample weight", format.spss = "F8.2"),
residency = structure(c(2, 2, 2, 2, 2, 2, 2, 2, 2),
label = "Urban / rural residency",
labels = c(urban = 1, rural = 2),
class = "haven_labelled"),
region = structure(c(4, 4, 4, 4, 4, 3, 3, 3, 3), label = "Region",
labels = c(North = 1, East = 2, South = 3, West = 4),
class = "haven_labelled")),
row.names = c(NA, -9L), class = c("tbl_df", "tbl", "data.frame"))
This function simply takes a variable name and converts it from labelled data to a factor.
#' Convert one labelled column of a data frame to a factor
#'
#' @param var The column to convert. May be a bare column name (`residency`),
#'   a string (`"residency"`), or any expression that evaluates to a single
#'   string or symbol (`var.names[1]`, `rlang::sym("residency")`).
#' @param data Data frame to modify. Defaults to the `df` visible from where
#'   the function is defined, which is what the function always used before.
#' @return `data` with the selected column replaced by `to_factor()` of itself.
my.func <- function(var, data = df) {
  var_quo <- rlang::enquo(var)
  var_expr <- rlang::quo_get_expr(var_quo)

  if (rlang::is_symbol(var_expr) && rlang::as_string(var_expr) %in% names(data)) {
    # A bare name that matches a column: use it as the column.
    col <- rlang::as_string(var_expr)
  } else {
    # Anything else (a string, var.names[1], sym(...), a variable holding the
    # name) is evaluated in the caller's environment to obtain the column name.
    col <- rlang::eval_tidy(var_quo)
    if (rlang::is_symbol(col)) {
      col <- rlang::as_string(col)
    }
    if (!rlang::is_string(col) || !col %in% names(data)) {
      stop("`var` must identify a single column of `data`.", call. = FALSE)
    }
  }

  # Look the column up by name on both sides, so a string is never passed to
  # to_factor() as if it were the data.
  data %>%
    mutate(!!col := to_factor(.data[[col]]))
}
Both of these lines work.
my.func(residency)
my.func("residency")
They return this:
id_woman weight residency region
<dbl> <dbl> <fct> <dbl+lbl>
1 391 1.80 rural 4 [West]
2 1643 1.80 rural 4 [West]
3 85 1.80 rural 4 [West]
4 881 1.80 rural 4 [West]
5 1981 1.80 rural 4 [West]
6 1072 0.998 rural 3 [South]
7 1978 0.998 rural 3 [South]
8 1607 0.998 rural 3 [South]
9 738 0.192 rural 3 [South]
The trouble comes if I try to provide the variable name as part of a vector, like this:
var.names <- c("residency", "region")
my.func(var.names[1])
Error: The LHS of `:=` must be a string or a symbol
Call `rlang::last_error()` to see a backtrace
I tried this, but it also failed.
my.func(rlang::sym(var.names[1]))
Error: The LHS of `:=` must be a string or a symbol
Call `rlang::last_error()` to see a backtrace
In this case, we have to evaluate (!!)
# var.names[1] is only a string. Unquoting the string itself hands the text
# "residency" to to_factor(), which fills the column with that word. Convert
# the string to a symbol first and unquote the symbol, so that my.func()
# receives the column.
my.func(!!rlang::sym(var.names[1]))
# A tibble: 9 x 4
# id_woman weight residency region
# <dbl> <dbl> <fct> <dbl+lbl>
#1 391 1.80 rural 4 [West]
#2 1643 1.80 rural 4 [West]
#3 85 1.80 rural 4 [West]
#4 881 1.80 rural 4 [West]
#5 1981 1.80 rural 4 [West]
#6 1072 0.998 rural 3 [South]
#7 1978 0.998 rural 3 [South]
#8 1607 0.998 rural 3 [South]
#9 738 0.192 rural 3 [South]

dplyr summarise_all with quantile and other functions

I have a dataframe PatientA
Height Weight Age BMI
<dbl> <dbl> <dbl> <dbl>
1 161 72.2 27 27.9
2 164 61.0 21 22.8
3 171 72.0 30 24.6
4 169. 63.9 25 22.9
5 174. 64.4 27 21.1
6 160 50.9 22 19.9
7 172 77.5 22 26.3
8 165 54.5 22 20
9 173 82.4 29 27.5
10 169 76.6 22 26.9
and I would like to get some statistics for each column. I have the next working code which deals only with quantiles
genStat <- PatientsA %>%
summarise_all(funs(list(quantile(., probs = c(0.25, 0.5, 0.75))))) %>%
unnest %>%
transpose %>%
setNames(., c('25%', '50%', '75%')) %>%
map_df(unlist) %>%
bind_cols(data.frame(vars = names(PatientsA)), .)
and I need to add mean and sd to summarise_all like this
genStat <- PatientsA %>%
summarise_all(funs(mean,sd,list(quantile(., probs = c(0.25, 0.5, 0.75))))) %>%
unnest %>%
transpose %>%
setNames(., c('mean','sd','25%', '50%', '75%')) %>%
map_df(unlist) %>%
bind_cols(data.frame(vars = names(PatientsA)), .)
This straightforward approach fails returning the next error:
Error in names(object) <- nm : 'names' attribute [5] must be the
same length as the vector [3]
I'm a newbie in R, so what is the right syntax for completing this task?
This is what I would suggest. There is a little repetition in the code (calling quantile three times) but overall I think it is easier to understand and debug.
library(tidyverse)
# Stack all columns into variable / value pairs, then compute the mean, the
# standard deviation and the three quartiles of every original column.
PatientsA %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  group_by(variable) %>%
  summarise(
    mean_val = mean(value),
    sd_val = sd(value),
    q25 = quantile(value, probs = 0.25),
    q50 = quantile(value, probs = 0.5),
    q75 = quantile(value, probs = 0.75)
  )
## A tibble: 4 x 6
# variable mean_val sd_val q25 q50 q75
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Age 24.7 3.33 22 23.5 27
#2 BMI 24.0 3.08 21.5 23.8 26.7
#3 Height 168. 5.01 164. 169 172.
#4 Weight 67.5 10.3 61.7 68.2 75.5
We could also place the quantile output in a list and then unnest
library(tidyverse)
PatientsA %>%
gather %>%
group_by(key) %>%
summarise_at(vars('value'),
funs(mean,
sd,
quantile = list(as.tibble(as.list(quantile(.,
probs = c(0.25, 0.5, 0.75))))))) %>%
unnest
# A tibble: 4 x 6
# key mean sd `25%` `50%` `75%`
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Age 24.7 3.33 22 23.5 27
#2 BMI 24.0 3.08 21.5 23.8 26.7
#3 Height 168. 5.01 164. 169 172.
#4 Weight 67.5 10.3 61.7 68.2 75.5
Or using pivot_longer
PatientsA %>%
pivot_longer(cols = everything()) %>%
group_by(name) %>%
summarise(across(value, list(mean= ~ mean(., na.rm = TRUE),
sd = ~ sd(., na.rm = TRUE),
quantile = ~ list(as_tibble(as.list(quantile(.,
probs = c(0.25, 0.5, 0.75)))))))) %>%
unnest(c(value_quantile))
# A tibble: 4 x 6
name value_mean value_sd `25%` `50%` `75%`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Age 24.7 3.33 22 23.5 27
2 BMI 24.0 3.08 21.5 23.8 26.7
3 Height 168. 5.01 164. 169 172.
4 Weight 67.5 10.3 61.7 68.2 75.5
###data
PatientsA <- structure(list(Height = c(161, 164, 171, 169, 174, 160, 172,
165, 173, 169), Weight = c(72.2, 61, 72, 63.9, 64.4, 50.9, 77.5,
54.5, 82.4, 76.6), Age = c(27L, 21L, 30L, 25L, 27L, 22L, 22L,
22L, 29L, 22L), BMI = c(27.9, 22.8, 24.6, 22.9, 21.1, 19.9, 26.3,
20, 27.5, 26.9)), class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6", "7", "8", "9", "10"))

Resources