Get rows from a column per group based on a condition - r

I have a data.frame as shown below:
The basic requirement is to find the average of the first "n" non-missing "Value" entries after a certain date, per group.
For ex:, user provides:
Certain Date = Failure Date
n = 4
Hence, for A, the average would be (60+70+80+100)/4 ; ignoring NAs
and for B, the average would be (80+90+100)/3. Note for B, n=4 does not happen as there are only 3 values after the satisfied condition failuredate = valuedate.
Here is the dput:
structure(list(Name = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A",
"B"), class = "factor"), FailureDate = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("1/5/2020", "1/7/2020"), class = "factor"), ValueDate = structure(c(1L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 2L, 1L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 2L), .Label = c("1/1/2020", "1/10/2020", "1/2/2020",
"1/3/2020", "1/4/2020", "1/5/2020", "1/6/2020", "1/7/2020", "1/8/2020",
"1/9/2020"), class = "factor"), Value = c(10L, 20L, 30L, 40L,
NA, 60L, 70L, 80L, NA, 100L, 10L, 20L, 30L, 40L, 50L, 60L, 70L,
80L, 90L, 100L)), class = "data.frame", row.names = c(NA, -20L
))

We could create an index with cumsum after grouping by 'Name', extract the 'Value' elements and get the mean
library(dplyr)

# Number of non-missing values to average after the failure date.
n <- 4

df1 %>%
  # Re-type the columns without creating factors, so the two date columns
  # can be compared as plain strings.
  type.convert(as.is = TRUE) %>%
  group_by(Name) %>%
  summarise(Ave = {
    # TRUE only for the rows that come after the FailureDate == ValueDate row.
    after_failure <- lag(cumsum(FailureDate == ValueDate), default = 0) > 0
    candidates <- na.omit(Value[after_failure])
    mean(head(candidates, n), na.rm = TRUE)
  })
# A tibble: 2 x 2
# Name Ave
# <chr> <dbl>
#1 A 77.5
#2 B 90

You can convert factor dates to the Date object and then compute averages of "n" numbers after FailureDate per group. Note that "n" numbers should exclude NA, so tidyr::drop_na() is used here.
library(dplyr)

# Number of non-missing values to average after the failure date.
n <- 4

df %>%
  # Parse every "*Date" column (m/d/Y strings stored as factors) into Date
  # objects so that `>` is a chronological comparison.
  mutate(across(
    contains("Date"),
    ~ as.Date(as.character(.x), format = "%m/%d/%Y")
  )) %>%
  # Drop missing values first so they do not use up any of the n slots.
  tidyr::drop_na(Value) %>%
  group_by(Name) %>%
  # head() simply returns fewer than n values when a group runs out of rows,
  # so no NA padding is created and na.rm is not needed.
  summarise(mean = mean(head(Value[ValueDate > FailureDate], n)))
# # A tibble: 2 x 2
# Name mean
# <fct> <dbl>
# 1 A 77.5
# 2 B 90

You can try this:
library(dplyr)

# Number of non-missing values to average after the failure date.
n <- 4

df %>%
  # Flag the row where the value date equals the failure date.
  mutate(condition = as.character(FailureDate) == as.character(ValueDate)) %>%
  group_by(Name) %>%
  # Within each group the running sum is 0 before the flagged row and 1 from
  # the flagged row onwards.
  mutate(condition = cumsum(condition)) %>%
  filter(condition == 1) %>%
  # Drop the flagged row itself: only values *after* the failure date count.
  slice(-1) %>%
  filter(!is.na(Value)) %>%
  # Keep at most n rows; groups with fewer rows keep what they have.
  slice(seq_len(n)) %>%
  summarise(mean_col = mean(Value))
> df
# A tibble: 2 x 2
Name mean_col
<fct> <dbl>
1 A 77.5
2 B 90

Related

gtsummary modified cross tab

[![enter image description here][2]][2][![i need help in writing gstummary r code to produce following table output.dummy table shown in above table][2]][2]
I need help writing gtsummary R code to produce the following table output. A dummy table is shown above.
[![enter image description here][2]][2]
library(gtsummary)
[![enter image description here][2]][2]
[![enter image description here][3]][3]
id
age
sex
country
edu
ln
ivds
n2
p5
1
a
M
eng
x
45
15
40
15
2
a
M
eng
x
23
26
70
15
4
a
M
eng
x
26
36
35
40
5
b
F
eng
x
26
25
36
47
6
b
F
wal
y
45
45
60
12
7
b
M
wal
y
60
25
36
15
8
c
M
wal
y
70
08
25
36
9
c
F
sco
z
80
25
36
15
10
c
F
sco
z
90
25
26
39
structure(list(id = 1:15, age = structure(c(1L, 1L, 2L, 1L, 2L,
2L, 2L, 3L, 3L, 3L, 1L, 1L, 2L, 1L, 2L), .Label = c("a", "b",
"c"), class = "factor"), sex = structure(c(2L, 1L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("F", "M"), class = "factor"),
country = structure(c(1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 3L), .Label = c("eng", "scot", "wale"
), class = "factor"), edu = structure(c(1L, 1L, 1L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L), .Label = c("x",
"y", "z"), class = "factor"), lon = c(45L, 23L,
25L, 45L, 70L, 69L, 90L, 50L, 62L, 45L, 23L, 25L, 45L, 70L,
69L), is = c(15L, 26L, 36L, 34L, 2L, 4L, 5L, 8L, 9L,
15L, 26L, 36L, 34L, 2L, 4L), n2 = c(40L, 70L, 50L, 60L,
30L, 25L, 80L, 89L, 10L, 40L, 70L, 50L, 60L, 30L, 25L), p5 = c(15L,
20L, 36L, 48L, 25L, 36L, 28L, 15L, 25L, 15L, 20L, 36L, 48L,
25L, 36L)), row.names = c(NA, 15L), class = "data.frame")
[
I made a table similar to what you have above (more similar to the table you had before you updated it). But I think it'll get you most of the way there.
The type of table you're requesting is something that is in the works. In the meantime, you will need to use the bstfun::tbl_2way_summary() function. This function exists in another package while we work to make it better before integrating with gtsummary.
# Build one combined table: n (%) for the categorical variables grade and
# stage, next to summary statistics of age, marker and ttdeath per level.
library(bstfun) # install with `remotes::install_github("ddsjoberg/bstfun")`
library(gtsummary)
packageVersion("gtsummary")
#> [1] '1.4.1'
# add a column that is all the same value
# NOTE(review): presumably this gives tbl_2way_summary() a single-level column
# variable, so each table has exactly one statistics column (stat_1) -- confirm
# against the bstfun documentation.
trial2 <- trial %>% mutate(constant = TRUE)
# loop over each continuous variable, construct table, then merge them together
tbls_row1 <-
c("age", "marker", "ttdeath") %>%
purrr::map(
~tbl_2way_summary(data = trial2, row = grade, col = constant, con = all_of(.x),
statistic = "{mean} ({sd}) - {min}, {max}") %>%
modify_header(stat_1 = paste0("**", .x, "**"))
) %>%
tbl_merge() %>%
modify_spanning_header(everything() ~ NA)
# repeat for the second row
# (same pipeline as above, with stage instead of grade as the row variable)
tbls_row2 <-
c("age", "marker", "ttdeath") %>%
purrr::map(
~tbl_2way_summary(data = trial2, row = stage, col = constant, con = all_of(.x),
statistic = "{mean} ({sd}) - {min}, {max}") %>%
modify_header(stat_1 = paste0("**", .x, "**"))
) %>%
tbl_merge() %>%
modify_spanning_header(everything() ~ NA)
# stack these tables
tbl_stacked <- tbl_stack(list(tbls_row1, tbls_row2))
# lastly, add calculated summary stats for categorical variables, and merge them
tbl_summary_stats <-
trial2 %>%
tbl_summary(
include = c(grade, stage),
missing = "no"
) %>%
modify_header(stat_0 ~ "**n (%)**") %>%
modify_footnote(everything() ~ NA)
# NOTE(review): stat_1_1_2, stat_1_2_2 and stat_1_3_2 look like the column
# names generated by the nested tbl_merge() calls; verify them (e.g. with
# show_header_names(tbl_final)) if the installed gtsummary version differs.
tbl_final <-
tbl_merge(list(tbl_summary_stats, tbl_stacked)) %>%
modify_spanning_header(everything() ~ NA) %>%
# column spanning column headers
modify_spanning_header(
list(c(stat_1_1_2, stat_1_2_2) ~ "**Group 1**",
stat_1_3_2 ~ "**Group 2**")
)
Created on 2021-07-10 by the reprex package (v2.0.0)

Find value for each subject and save as new table

I have a data frame with long format data as follows
ID Frame.No ROI.No Flux.med
01 1 1 78
01 1 2 76
01 2 1 80
01 2 2 80
01 3 1 89
01 3 2 80
27 1 1 60
27 1 2 68
27 4 1 80
27 4 2 89
For each "ID" I want to get the first and maximum Flux.med for both ROI 1 and 2 and put all these in a new dataframe. If I have a dataframe with just one subject (e.g. ID 01) I am able to identify the Flux.med values I need using the following code:
# Single-subject version: pull the baseline (lowest frame number) and the
# maximum Flux.Med for each of the two ROIs.

# ROI 1, value in the lowest-numbered frame
ROI1.baseline <- mydata %>%
  filter(ROI.No == "ROI 1" & Frame.No == min(Frame.No)) %>%
  select(Flux.Med)

# ROI 1, largest value over all frames
ROI1.max <- mydata %>%
  filter(ROI.No == "ROI 1") %>%
  filter(Flux.Med == max(Flux.Med)) %>%
  select(Flux.Med)

# ROI 2, value in the lowest-numbered frame
ROI2.baseline <- mydata %>%
  filter(ROI.No == "ROI 2" & Frame.No == min(Frame.No)) %>%
  select(Flux.Med)

# ROI 2, largest value over all frames (named ROI2.max to match the other
# three results)
ROI2.max <- mydata %>%
  filter(ROI.No == "ROI 2") %>%
  filter(Flux.Med == max(Flux.Med)) %>%
  select(Flux.Med)
But I need to do that for each ID and save the results in a dataframe.
Can I do this with a for loop?
We can get first and max value in each ID and ROI.No.
library(dplyr)

# One row per ID / ROI.No combination with the baseline and the maximum flux.
mydata %>%
  group_by(ID, ROI.No) %>%
  summarise(
    # Baseline = value in the lowest-numbered frame, regardless of row order.
    first_flux = first(Flux.med, order_by = Frame.No),
    max_flux = max(Flux.med),
    # Return an ungrouped result instead of one still grouped by ID.
    .groups = "drop"
  )
# ID ROI.No first_flux max_flux
# <int> <int> <int> <int>
#1 1 1 78 89
#2 1 2 76 80
#3 27 1 60 80
#4 27 2 68 89
Or using aggregate :
# For every ID / ROI.No combination, return the first and the largest Flux.med.
aggregate(
  Flux.med ~ ID + ROI.No,
  data = mydata,
  FUN = function(flux) c(first = flux[1], max = max(flux))
)
data
mydata <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 27L, 27L, 27L,
27L), Frame.No = c(1L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 4L, 4L), ROI.No = c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), Flux.med = c(78L, 76L, 80L,
80L, 89L, 80L, 60L, 68L, 80L, 89L)), class = "data.frame", row.names = c(NA,-10L))
We can use data.table
library(data.table)

# Convert df1 to a data.table by reference, then compute the first and the
# largest Flux.med for each ID / ROI.No combination.
setDT(df1)[
  ,
  .(first_flux = first(Flux.med), max_flux = max(Flux.med)),
  by = .(ID, ROI.No)
]
data
df1 <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 27L, 27L, 27L,
27L), Frame.No = c(1L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 4L, 4L), ROI.No = c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), Flux.med = c(78L, 76L, 80L,
80L, 89L, 80L, 60L, 68L, 80L, 89L)), class = "data.frame",
row.names = c(NA,-10L))
Thanks for the suggestions. So this is how I did it in the end:
# Per-ID baseline and maximum Flux.Med for ROI 1 and ROI 2. Each of the four
# tables has the columns ID plus one value column, so they can be merged on ID.

ROI1.baseline <- ldi_data %>%
  group_by(ID) %>%
  # uses lowest number frame as baseline (not necessarily frame 1 if it was excluded)
  filter(ROI.No == "ROI 1" & Frame.No == min(Frame.No)) %>%
  # keep ID explicitly instead of relying on select() re-adding the group column
  select(ID, Flux.Med) %>%
  dplyr::rename(ROI1_baseline = Flux.Med) %>%
  as.data.frame()

ROI1.max <- ldi_data %>%
  group_by(ID) %>%
  filter(ROI.No == "ROI 1") %>%
  filter(Flux.Med == max(Flux.Med)) %>%
  select(ID, Flux.Med) %>%
  dplyr::rename(ROI1_max = Flux.Med) %>%
  as.data.frame()

ROI2.baseline <- ldi_data %>%
  group_by(ID) %>%
  filter(ROI.No == "ROI 2" & Frame.No == min(Frame.No)) %>%
  select(ID, Flux.Med) %>%
  dplyr::rename(ROI2_baseline = Flux.Med) %>%
  as.data.frame()

ROI2.max <- ldi_data %>%
  group_by(ID) %>%
  filter(ROI.No == "ROI 2") %>%
  filter(Flux.Med == max(Flux.Med)) %>%
  select(ID, Flux.Med) %>%
  dplyr::rename(ROI2_max = Flux.Med) %>%
  as.data.frame()

# merge() joins on the shared column (ID); Reduce() chains the three merges.
summary <- Reduce(merge, list(ROI1.baseline, ROI1.max, ROI2.baseline, ROI2.max))

How to make multiple boxplots by two different groups in one graph?

The part of dataset is like this:
Treatment Status gene1 gene2
1 Both Deceased 3.1934860 63.8697194
2 Both Deceased 0.0000000 11.3436426
3 Chemo Deceased 7.2186817 35.0621681
4 Both Deceased 7.2186817 23.7185255
5 Chemo Deceased 0.8049256 17.7083638
6 Chemo Censored 0.8250437 0.8250437
7 Chemo Censored 3.4136505 23.895533
8 Radio Censored 0.9428735 4.7143673
9 None Censored 3.3001750 10.7255686
I want to compare each gene's expression in "deceased" vs "censored" for each treatment. So far I have only managed to plot one gene's expression, like this:
ggboxplot(df, x="Treatment", y= "gene1", fill = "Status")
Is there any way I can combine two genes' boxplots in one graph? Or any other better way to show these genes expression level difference between deceased vs censored in each group?
We may use boxplot() in base R, where we need to use reshape() first to get a long format.
# Reshape gene1/gene2 (columns 4:5 once the id column is added) to long
# format, then draw one box per Status x gene (time) x Treatment combination.
boxplot(
  gene ~ Status + time + Treatment,
  data = reshape(
    cbind(id = rownames(dat), dat),
    varying = 4:5,
    sep = "",
    direction = "long"
  ),
  border = 1:2
)
However, this yields a quite crowded plot. We could do separate boxplots for e.g. each treatment group using sapply().
# Draw a 2 x 2 grid with one panel per treatment group.
# Remember the current layout so it can be restored once the panels are drawn.
old_par <- par(mfrow = c(2, 2))
# boxplot() is called for its side effect only, so iterate with lapply() and
# discard the return values instead of letting sapply() simplify and print them.
invisible(lapply(as.character(unique(dat$Treatment)), function(trt) {
  trt_rows <- dat[dat$Treatment == trt, ]
  # Long format: one row per sample and gene, with the expression in `value`.
  trt_long <- reshape(
    cbind(id = rownames(trt_rows), trt_rows),
    varying = 4:5, sep = "", direction = "long",
    v.names = "value", timevar = "gene"
  )
  boxplot(
    value ~ Status + gene,
    data = trt_long,
    at = c(1:2, 4:5), # leave a gap between the two pairs of boxes
    main = trt,
    border = 1:2
  )
}))
par(old_par)
Result
Data
dat <- structure(list(Treatment = structure(c(1L, 1L, 1L, 2L, 2L, 2L,
3L, 3L, 3L, 4L, 4L, 4L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L), .Label = c("Both", "Chemo", "None", "Radio"), class = "factor"),
Status = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Censored", "Deceased"), class = "factor"),
gene1 = c(2.83185327992901, 5.21658677992433, 9.36719279899948,
1.77809421116808, 6.39453760571561, 3.08376117126782, -1.99524072673447,
0.380722587753265, -0.947148460332481, 1.73014054712629,
0.855919162512028, 0.501667581598007, 0.0638735169737497,
10.1712355237258, 5.34317645471502, -7.96626158445742, -0.0781613844302278,
5.59930916967042, -0.725717330717595, 0.492793009977729,
-0.546677404630108, 0.290301979542245, 2.83540215865274,
-1.25738031049913), gene2 = c(6.97361394841868, -6.86012827859373,
-0.193731972798249, -5.64669185350061, -20.6664537342379,
32.5477488386544, 12.6210452154023, 6.56845245925654, 13.5491140544121,
-2.9113829554538, 2.90958200298303, -6.56806056188421, 50.2577234864485,
17.0734922804668, 49.0769939658538, -2.0186433516603, 32.3823429023035,
17.7654319738005, 12.2884241568455, 21.7600566866782, 19.68978862329,
-12.6277420840716, 27.555120882401, 17.5164450232983)), row.names = c(3L,
23L, 13L, 44L, 34L, 50L, 90L, 67L, 62L, 100L, 95L, 96L, 132L,
144L, 124L, 174L, 171L, 168L, 196L, 205L, 207L, 233L, 229L, 212L
), class = "data.frame")
using the data from jay.sf you can try a 'ggplot'. I'm using the tidyverse, but this is not required.
library(tidyverse)

# One panel per treatment; within each panel, boxes per Status, filled by gene.
dat %>%
  as_tibble() %>%
  # Long format: one row per sample and gene, with the expression in `mRNA`.
  pivot_longer(
    cols = -c(Treatment, Status),
    names_to = "gene",
    values_to = "mRNA"
  ) %>%
  ggplot(aes(Status, mRNA, fill = gene)) +
  geom_boxplot() +
  facet_wrap(~Treatment, ncol = 2, scales = "free_y")
and with facet_grid you can add significance levels automatically
# Status x Treatment grid of panels; each panel compares gene1 with gene2 and
# annotates the comparison with a significance bracket.
dat %>%
  as_tibble() %>%
  # Long format: one row per sample and gene, with the expression in `mRNA`.
  pivot_longer(
    cols = -c(Treatment, Status),
    names_to = "gene",
    values_to = "mRNA"
  ) %>%
  ggplot(aes(gene, mRNA, fill = gene)) +
  geom_boxplot(show.legend = FALSE) +
  ggbeeswarm::geom_beeswarm(show.legend = FALSE) +
  ggsignif::geom_signif(comparisons = list(c("gene1", "gene2"))) +
  facet_grid(Status ~ Treatment, scales = "free_y")

how can I subtract specific region from each block

I have a data with several column
df<- structure(list(X1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), X2 = structure(c(1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L,
18L, 19L, 20L, 21L, 22L, 23L, 24L, 7L, 8L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L, 20L, 21L, 22L, 23L, 24L), .Label = c("B02", "B03", "B04",
"B05", "B06", "B07", "C02", "C03", "C04", "C05", "C06", "C07",
"D02", "D03", "D04", "D05", "D06", "D07", "G02", "G03", "G04",
"G05", "G06", "G07"), class = "factor"), X3 = c(0.005648642,
0.005876389, 0.00592532, 0.006244456, 0.005987075, 0.006075874,
0.006198667, 0.006003758, 0.006041885, 0.006186987, 0.006041323,
0.006071594, 0.005902391, 0.005976096, 0.00593805, 0.005866524,
0.0059831, 0.005902586, 0.005914309, 0.005887304, 0.006054509,
0.005931266, 0.005936195, 0.005895191, 0.005840959, 0.005849247,
0.005808851, 0.005833586, 0.005825153, 0.00584873, 0.005983976,
0.00598669, 0.006011548, 0.005997747, 0.005851022, 0.005919044,
0.005854566, 0.0058226, 0.00578052, 0.005784874, 0.005933198,
0.005996407, 0.005898848, 0.00595775, 0.005918857, 0.005882898,
0.005877808, 0.005803604, 0.006235161, 0.005808725), X4 = c(0.024054157,
0.025850824, 0.023122678, 0.042337945, 0.030468744, 0.026481616,
0.017430149, 0.024019931, 0.025572552, 0.024869532, 0.024148692,
0.025228634, 0.030078166, 0.025860944, 0.023530696, 0.029244585,
0.026599876, 0.023578245, 0.024014744, 0.023963795, 0.025466639,
0.02969377, 0.02307532, 0.022739164, 0.02860112, 0.022800416,
0.022287748, 0.022491258, 0.023340693, 0.024576665, 0.024378624,
0.030037462, 0.024904579, 0.025901291, 0.02912765, 0.024926085,
0.024044815, 0.023799791, 0.023147189, 0.021253484, 0.023979501,
0.029659496, 0.029393487, 0.02470603, 0.024562731, 0.023819856,
0.025065129, 0.023228642, 0.023919905, 0.024781896), X5 = c(0.00535592,
0.00555428, 0.00555428, 0.00572213, 0.00573739, 0.00575265, 0.00576791,
0.00572213, 0.00573739, 0.00572213, 0.00575265, 0.00576791, 0.0056611,
0.0056611, 0.00567636, 0.00563058, 0.0056611, 0.00564584, 0.00563058,
0.00561532, 0.00575265, 0.00569162, 0.00567636, 0.00564584, 0.00561532,
0.00560006, 0.00556954, 0.0055848, 0.00555428, 0.00556954, 0.00569162,
0.00573739, 0.00572213, 0.00567636, 0.00561532, 0.00561532, 0.0055848,
0.00553903, 0.00552377, 0.00549325, 0.0056611, 0.00572213, 0.0056611,
0.0056611, 0.00563058, 0.00561532, 0.0055848, 0.00553903, 0.00553903,
0.00550851)), .Names = c("X1", "X2", "X3", "X4", "X5"), class = "data.frame", row.names = c(NA,
-50L))
Basically, I am trying to correct each value based on the average of several values.
I want to take the average of G02, G03, G04 and G05 and then subtract it from each value in that column, separately for each value of X1 (1, 2, or whatever other numbers it has).
for example lets look at the X3
take average of
0.005914309
0.005887304
0.006054509
0.005931266
The average of this will be 0.005946847. Then I subtract it from the first value with having X1 as 1. It becomes 0.005648642 -0.005946847 =-0.000298205
Then subtract from each of the values in that column where the X1 is 1
The same Take average of the G02, G03, G04, G05 when the X1 is 2 and subtract it from each value of that column when the X1 is 2 etc etc
Here is an option using data.table join
library(data.table)
# Columns to correct (X3, X4, X5) and the X2 labels whose rows define the
# reference mean (G02..G05).
nm1 <- paste0("X", 3:5)
nm2 <- paste0("G0", 2:5)
# Work on a copy so that setDT() and `:=` below do not modify `df` itself.
dfN <- copy(df)
# Inner dfN[...]: per-X1 mean of X3:X5 over the rows whose X2 is in nm2.
# Outer call: join those means back onto dfN by X1 and overwrite the nm1
# columns with (original value - group mean); inside the join, the "i."
# prefix refers to the columns of the joined table of means.
setDT(dfN)[dfN[X2 %in% nm2, lapply(.SD, function(x) mean(x)),
by = .(X1), .SDcols = X3:X5], (nm1) := Map(`-`, mget(nm1), mget(paste0("i.", nm1))),
on = .(X1)]
# Show the first ten corrected rows.
head(dfN, 10)
# X1 X2 X3 X4 X5
# 1: 1 B02 -0.000298205 -0.001730580 -0.0003166225
# 2: 1 B03 -0.000070458 0.000066087 -0.0001182625
# 3: 1 B04 -0.000021527 -0.002662059 -0.0001182625
# 4: 1 B05 0.000297609 0.016553208 0.0000495875
# 5: 1 B06 0.000040228 0.004684007 0.0000648475
# 6: 1 B07 0.000129027 0.000696879 0.0000801075
# 7: 1 C02 0.000251820 -0.008354588 0.0000953675
# 8: 1 C03 0.000056911 -0.001764806 0.0000495875
# 9: 1 C04 0.000095038 -0.000212185 0.0000648475
#10: 1 C05 0.000240140 -0.000915205 0.0000495875
A solution using dplyr. df2 is the mean from G02 to G05. df3 is the final output.
library(dplyr)
library(tidyr) # gather() and spread() come from tidyr, not dplyr

# df2: per-X1 mean of X3..X5 over the reference rows G02-G05, in long format
# (columns X1, Col, Value).
df2 <- df %>%
  filter(X2 %in% paste0("G0", 2:5)) %>%
  group_by(X1) %>%
  # Pass the function directly; funs() is deprecated.
  summarise_at(vars(-X2), mean) %>%
  gather(Col, Value, -X1)

# df3: every value minus the reference mean of its X1 group and column.
df3 <- df %>%
  group_by(X1) %>%
  # Row counter within X1, used to restore the original row order at the end.
  mutate(ID = row_number()) %>%
  gather(Col, Value, -ID, -X1, -X2) %>%
  # After the join, Value.x is the original value and Value.y the group mean.
  left_join(df2, by = c("X1", "Col")) %>%
  mutate(Value = Value.x - Value.y) %>%
  select(ID, X1, X2, Col, Value) %>%
  spread(Col, Value) %>%
  arrange(X1, ID) %>%
  select(-ID) %>%
  ungroup()
df3
# A tibble: 50 x 5
X1 X2 X3 X4 X5
<int> <fctr> <dbl> <dbl> <dbl>
1 1 B02 -0.000298205 -0.001730580 -0.0003166225
2 1 B03 -0.000070458 0.000066087 -0.0001182625
3 1 B04 -0.000021527 -0.002662059 -0.0001182625
4 1 B05 0.000297609 0.016553208 0.0000495875
5 1 B06 0.000040228 0.004684007 0.0000648475
6 1 B07 0.000129027 0.000696879 0.0000801075
7 1 C02 0.000251820 -0.008354588 0.0000953675
8 1 C03 0.000056911 -0.001764806 0.0000495875
9 1 C04 0.000095038 -0.000212185 0.0000648475
10 1 C05 0.000240140 -0.000915205 0.0000495875
# ... with 40 more rows

dplyr mean group on Long Format Data

I have trouble figuring out how I can compute a simple mean with dplyr on Long Format data.
My data look like this :
hldid idno sex diary age
1 1294 1294_1 2 1 39
2 1294 1294_1 2 2 39
3 1294 1294_2 1 1 43
4 1294 1294_2 1 2 43
...
With 4 variables : hldid idno sex diary age
idno is the personal identifier but not the unique key.
Each individual is repeated 2 times, one for each diary filled.
What I would like is to simply compute the age mean by sex.
Could you help me out ?
I tried something like :
dta %>%
group_by(sex) %>%
mutate( ng = n_distinct(idno)) %>%
group_by(age, add=TRUE) %>%
summarise(mean = n()/ng[1] )
But it does not work.
The data :
dta = structure(list(hldid = c(1294, 1294, 1294, 1294, 1352, 1352,
1352, 1352, 3741, 3741, 3741, 3741, 3809, 3809, 3809, 3809, 4037,
4037, 4037, 4037), idno = c("1294_1", "1294_1", "1294_2", "1294_2",
"1352_1", "1352_1", "1352_2", "1352_2", "3741_1", "3741_1", "3741_2",
"3741_2", "3809_1", "3809_1", "3809_2", "3809_2", "4037_1", "4037_1",
"4037_2", "4037_2"), sex = c(2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L), diary = c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L), age = c(39L, 39L, 43L, 43L, 31L, 31L, 37L, 37L,
33L, 33L, 37L, 37L, 34L, 34L, 37L, 37L, 41L, 41L, 32L, 32L)), .Names = c("hldid",
"idno", "sex", "diary", "age"), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -20L), vars = list(hldid), drop = TRUE, indices = list(
0:3, 4:7, 8:11, 12:15, 16:19), group_sizes = c(4L, 4L, 4L,
4L, 4L), biggest_group_size = 4L, labels = structure(list(hldid = c(1294,
1352, 3741, 3809, 4037)), class = "data.frame", row.names = c(NA,
-5L), .Names = "hldid", vars = list(hldid)))
quick update
Maybe this does not apply for this example,
but this kind of issues I have in mind is the following :
Imagine we have data like this :
3 women and 2 men, and a dummy act variable.
If we compute the mean without taking the long format into account, we will have problems.
aggregate(act ~ sex, FUN = mean, data = dtaTime)
What we should do is this :
aggregate(act ~ sex, FUN = sum, data = dtaTime)
6 / 2 # men
10 / 3 # women
Data
dtaTime = structure(list(id = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L),
sex = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), act = c(1L,
1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L)), .Names = c("id", "sex",
"act"), class = "data.frame", row.names = c(NA, -25L))
You are making it too complicated,
# Mean age per sex. group_by(sex) replaces the grouping by hldid that dta
# carries in the posted dput().
# In the sample data every idno appears exactly twice, so the mean over rows
# equals the mean over individuals; NOTE(review): with unequal numbers of rows
# per person, deduplicate by idno first.
dta %>%
group_by(sex) %>%
summarise(meanage = mean(age))
should give you the mean age by sex.
A base R alternative:
aggregate(age ~ sex, dta, mean)
A data.table alternative:
library(data.table)
setDT(dta)[, .(meanage = mean(age)), by = sex]

Resources