I have a dataframe from Alzheimer disease patients. I would like to create a table with counts for the number of patients (indicated by patient ID: iid) with age_at_onset <75, <70 for each status (1,2,3) per Ethnicity. How can I do this in R?
df <- structure(list(iid = structure(c(`5068` = 80L, `15562` = 58L,
`8939` = 52L, `17602` = 34L, `3173` = 40L, `12591` = 30L, `17391` = 97L,
`8241` = 93L, `9746` = 10L, `9673` = 7L, `16594` = 29L, `16911` = 60L,
`4796` = 18L, `6598` = 12L, `11462` = 26L, `16425` = 17L, `12698` = 37L,
`17118` = 81L, `1501` = 76L, `13294` = 92L, `8072` = 84L, `11642` = 46L,
`4164` = 85L, `9035` = 62L, `16691` = 35L, `16002` = 86L, `3915` = 21L,
`7409` = 54L, `9759` = 11L, `6130` = 6L, `15153` = 23L, `13539` = 100L,
`13262` = 87L, `742` = 28L, `17592` = 33L, `16812` = 53L, `213` = 66L,
`11963` = 77L, `12093` = 89L, `11910` = 68L, `15813` = 73L, `1104` = 51L,
`1966` = 95L, `5589` = 61L, `8860` = 41L, `482` = 16L, `3967` = 55L,
`5869` = 1L, `12435` = 20L, `11675` = 50L, `16701` = 36L, `5893` = 2L,
`16880` = 57L, `13290` = 90L, `1097` = 49L, `1476` = 71L, `9100` = 67L,
`6220` = 8L, `15393` = 42L, `16631` = 31L, `9641` = 4L, `13485` = 99L,
`1028` = 44L, `8200` = 91L, `12190` = 94L, `5581` = 19L, `7266` = 43L,
`12254` = 98L, `15763` = 69L, `17764` = 79L, `16239` = 96L, `7548` = 59L,
`12037` = 83L, `7813` = 70L, `12943` = 63L, `17748` = 75L, `12703` = 38L,
`11964` = 78L, `14018` = 45L, `1769` = 88L, `13713` = 22L, `13100` = 74L,
`13866` = 32L, `2527` = 25L, `2281` = 15L, `4463` = 39L, `5815` = 14L,
`14040` = 47L, `16560` = 24L, `12887` = 56L, `11167` = 13L, `6123` = 5L,
`5668` = 48L, `3036` = 82L, `7622` = 65L, `11470` = 27L, `4770` = 64L,
`17050` = 72L, `6295` = 9L, `9575` = 3L), .Label = c("08AD09051_NACC295883",
"08AD10766_NACC977458", "08AD9133", "09AD14006", "09AD14313_NACC904765",
"09AD14360_NACC785663", "09AD14874", "09AD14943_NACC009736",
"09AD15417_NACC169039", "09AD15778", "09AD15810", "09AD17022_NACC426380",
"25795", "NACC026302", "NACC026743", "NACC044624", "NACC062886",
"NACC083669", "NACC088187", "NACC094571", "NACC107551", "NACC134929",
"NACC178119", "NACC178349", "NACC183751", "NACC186606", "NACC192719",
"NACC193548", "NACC209758", "NACC224665", "NACC243923", "NACC246256",
"NACC261383", "NACC283729", "NACC298544", "NACC305567", "NACC310219",
"NACC310896", "NACC312856", "NACC336802", "NACC342957", "NACC350799",
"NACC351234_09AD13080", "NACC355338", "NACC355951", "NACC361682",
"NACC369873", "NACC397276", "NACC402765", "NACC403144", "NACC407162",
"NACC412031", "NACC413408", "NACC422516_08AD10849", "NACC436908",
"NACC465387", "NACC472288", "NACC479723", "NACC485644_08AD8204",
"NACC504120", "NACC508353", "NACC509594", "NACC510498", "NACC519864",
"NACC521718_08AD9198", "NACC559675", "NACC585997", "NACC605438",
"NACC612578", "NACC619036_09AD14621", "NACC621261", "NACC634809",
"NACC635885", "NACC639654", "NACC640099", "NACC642393", "NACC660918",
"NACC660981", "NACC684037", "NACC690933", "NACC695603", "NACC703758",
"NACC740374", "NACC744168_08AD7716", "NACC766835", "NACC769330",
"NACC775129", "NACC792439", "NACC796641", "NACC805995", "NACC806269_09AD13056",
"NACC809589", "NACC824113_08AD9038", "NACC884140", "NACC916661",
"NACC921664", "NACC926195", "NACC929277", "NACC959601", "NACC992086"
), class = "factor"), omit = structure(c(`5068` = 1L, `15562` = 1L,
`8939` = 1L, `17602` = 1L, `3173` = 1L, `12591` = 2L, `17391` = 1L,
`8241` = 1L, `9746` = 1L, `9673` = 2L, `16594` = 2L, `16911` = 2L,
`4796` = 1L, `6598` = 2L, `11462` = 1L, `16425` = 1L, `12698` = 1L,
`17118` = 1L, `1501` = 1L, `13294` = 1L, `8072` = 1L, `11642` = 2L,
`4164` = 1L, `9035` = 1L, `16691` = 1L, `16002` = 1L, `3915` = 1L,
`7409` = 1L, `9759` = 1L, `6130` = 1L, `15153` = 1L, `13539` = 2L,
`13262` = 1L, `742` = 2L, `17592` = 1L, `16812` = 1L, `213` = 2L,
`11963` = 2L, `12093` = 2L, `11910` = 2L, `15813` = 1L, `1104` = 1L,
`1966` = 1L, `5589` = 1L, `8860` = 1L, `482` = 1L, `3967` = 1L,
`5869` = 2L, `12435` = 1L, `11675` = 2L, `16701` = 1L, `5893` = 1L,
`16880` = 2L, `13290` = 2L, `1097` = 1L, `1476` = 1L, `9100` = 1L,
`6220` = 1L, `15393` = 1L, `16631` = 1L, `9641` = 1L, `13485` = 2L,
`1028` = 1L, `8200` = 2L, `12190` = 1L, `5581` = 2L, `7266` = 1L,
`12254` = 1L, `15763` = 1L, `17764` = 1L, `16239` = 1L, `7548` = 1L,
`12037` = 1L, `7813` = 1L, `12943` = 2L, `17748` = 1L, `12703` = 1L,
`11964` = 1L, `14018` = 1L, `1769` = 1L, `13713` = 1L, `13100` = 1L,
`13866` = 2L, `2527` = 1L, `2281` = 1L, `4463` = 1L, `5815` = 1L,
`14040` = 1L, `16560` = 2L, `12887` = 1L, `11167` = 2L, `6123` = 2L,
`5668` = 1L, `3036` = 1L, `7622` = 1L, `11470` = 1L, `4770` = 1L,
`17050` = 2L, `6295` = 2L, `9575` = 1L), .Label = c("0", "1"), class = "factor"),
sex = structure(c(1L, 1L, 1L, 2L, 1L, NA, 2L, 2L, 1L, NA,
2L, 2L, 1L, NA, 2L, 2L, 2L, 2L, 2L, 2L, 2L, NA, 2L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 2L, NA, 2L, 2L, 1L, 2L, 2L, NA, NA, NA,
2L, 1L, 2L, 1L, 1L, 2L, 2L, NA, 2L, NA, 1L, 2L, 1L, NA, 2L,
1L, 2L, 2L, 2L, 2L, 1L, NA, 2L, NA, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, NA, 2L, 2L, 1L, 2L, 1L, 1L, 2L, NA, 1L, 2L,
1L, 2L, 2L, 1L, 1L, NA, NA, 1L, 1L, 2L, 2L, 2L, 2L, NA, 2L
), .Label = c(" 1", " 2", "-9"), class = "factor"), status = structure(c(2L,
2L, 2L, 1L, 3L, NA, 2L, 2L, 2L, NA, 2L, NA, 1L, NA, 2L, 1L,
1L, 1L, 2L, 1L, 2L, NA, 3L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L,
NA, 1L, 2L, 2L, 2L, NA, NA, NA, NA, 2L, 1L, 1L, 3L, 2L, 1L,
3L, NA, 1L, NA, 2L, 1L, NA, NA, 1L, 2L, 2L, 1L, 1L, 1L, 2L,
NA, 1L, NA, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, NA, 1L,
1L, 2L, 2L, 2L, 1L, 1L, NA, 3L, 1L, 3L, 3L, 1L, NA, 2L, NA,
NA, 3L, 2L, 2L, 1L, 2L, 2L, NA, 2L), .Label = c(" 1", " 2",
" 3", "-9"), class = "factor"), age_at_onset = structure(c(`5068` = 4L,
`15562` = 16L, `8939` = 24L, `17602` = NA, `3173` = 24L,
`12591` = NA, `17391` = 15L, `8241` = 13L, `9746` = 18L,
`9673` = NA, `16594` = 20L, `16911` = NA, `4796` = NA, `6598` = NA,
`11462` = 20L, `16425` = NA, `12698` = NA, `17118` = NA,
`1501` = 5L, `13294` = NA, `8072` = 11L, `11642` = NA, `4164` = 25L,
`9035` = NA, `16691` = NA, `16002` = NA, `3915` = NA, `7409` = 21L,
`9759` = 14L, `6130` = NA, `15153` = NA, `13539` = NA, `13262` = NA,
`742` = 26L, `17592` = 28L, `16812` = 9L, `213` = 14L, `11963` = NA,
`12093` = NA, `11910` = NA, `15813` = 10L, `1104` = NA, `1966` = NA,
`5589` = 16L, `8860` = 8L, `482` = NA, `3967` = 7L, `5869` = NA,
`12435` = NA, `11675` = NA, `16701` = 19L, `5893` = NA, `16880` = 22L,
`13290` = NA, `1097` = NA, `1476` = 7L, `9100` = 22L, `6220` = NA,
`15393` = NA, `16631` = NA, `9641` = NA, `13485` = NA, `1028` = NA,
`8200` = NA, `12190` = NA, `5581` = NA, `7266` = 17L, `12254` = 17L,
`15763` = NA, `17764` = 6L, `16239` = NA, `7548` = 14L, `12037` = 27L,
`7813` = 26L, `12943` = NA, `17748` = NA, `12703` = NA, `11964` = 20L,
`14018` = 23L, `1769` = 25L, `13713` = NA, `13100` = NA,
`13866` = NA, `2527` = 12L, `2281` = NA, `4463` = 1L, `5815` = 3L,
`14040` = NA, `16560` = NA, `12887` = 14L, `11167` = NA,
`6123` = NA, `5668` = 5L, `3036` = 2L, `7622` = 7L, `11470` = NA,
`4770` = 17L, `17050` = 15L, `6295` = NA, `9575` = 19L), .Label = c("44",
"52", "56", "58", "60", "61", "62", "64", "65", "66", "67",
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78",
"79", "80", "81", "82", "83", "88", "90"), class = "factor"),
age_last_visit = structure(c(`5068` = 8L, `15562` = 18L,
`8939` = 24L, `17602` = 16L, `3173` = 21L, `12591` = NA,
`17391` = 17L, `8241` = NA, `9746` = NA, `9673` = NA, `16594` = 25L,
`16911` = 4L, `4796` = 5L, `6598` = NA, `11462` = 21L, `16425` = 10L,
`12698` = 25L, `17118` = 12L, `1501` = 7L, `13294` = 9L,
`8072` = NA, `11642` = NA, `4164` = 21L, `9035` = 21L, `16691` = 3L,
`16002` = 14L, `3915` = 13L, `7409` = NA, `9759` = NA, `6130` = 25L,
`15153` = 22L, `13539` = NA, `13262` = 24L, `742` = 26L,
`17592` = 30L, `16812` = 9L, `213` = 11L, `11963` = NA, `12093` = NA,
`11910` = NA, `15813` = 10L, `1104` = 24L, `1966` = 14L,
`5589` = 18L, `8860` = 23L, `482` = 15L, `3967` = 7L, `5869` = NA,
`12435` = 6L, `11675` = NA, `16701` = 25L, `5893` = NA, `16880` = 20L,
`13290` = NA, `1097` = 8L, `1476` = 5L, `9100` = 28L, `6220` = 21L,
`15393` = 17L, `16631` = 9L, `9641` = 24L, `13485` = NA,
`1028` = 7L, `8200` = NA, `12190` = 8L, `5581` = 15L, `7266` = NA,
`12254` = 19L, `15763` = 7L, `17764` = 6L, `16239` = 11L,
`7548` = NA, `12037` = 29L, `7813` = NA, `12943` = NA, `17748` = 23L,
`12703` = 27L, `11964` = 23L, `14018` = 26L, `1769` = 24L,
`13713` = 13L, `13100` = 20L, `13866` = NA, `2527` = 13L,
`2281` = 21L, `4463` = 4L, `5815` = 3L, `14040` = 2L, `16560` = 14L,
`12887` = 24L, `11167` = NA, `6123` = NA, `5668` = 12L, `3036` = 1L,
`7622` = NA, `11470` = 18L, `4770` = 18L, `17050` = 18L,
`6295` = NA, `9575` = NA), .Label = c("59", "60", "61", "62",
"64", "65", "67", "68", "69", "70", "71", "72", "73", "74",
"75", "76", "77", "79", "80", "81", "82", "83", "84", "85",
"86", "89", "91", "92", "93", "94"), class = "factor"), age_at_death = structure(c(`5068` = 2L,
`15562` = NA, `8939` = NA, `17602` = NA, `3173` = NA, `12591` = NA,
`17391` = NA, `8241` = 10L, `9746` = 9L, `9673` = NA, `16594` = NA,
`16911` = NA, `4796` = NA, `6598` = NA, `11462` = NA, `16425` = NA,
`12698` = NA, `17118` = NA, `1501` = NA, `13294` = NA, `8072` = 6L,
`11642` = NA, `4164` = NA, `9035` = NA, `16691` = NA, `16002` = NA,
`3915` = NA, `7409` = 16L, `9759` = 8L, `6130` = NA, `15153` = NA,
`13539` = NA, `13262` = NA, `742` = 14L, `17592` = NA, `16812` = NA,
`213` = NA, `11963` = NA, `12093` = NA, `11910` = NA, `15813` = NA,
`1104` = NA, `1966` = NA, `5589` = NA, `8860` = NA, `482` = NA,
`3967` = NA, `5869` = NA, `12435` = NA, `11675` = NA, `16701` = NA,
`5893` = 16L, `16880` = NA, `13290` = NA, `1097` = NA, `1476` = 1L,
`9100` = NA, `6220` = NA, `15393` = NA, `16631` = NA, `9641` = NA,
`13485` = NA, `1028` = NA, `8200` = NA, `12190` = NA, `5581` = NA,
`7266` = 11L, `12254` = NA, `15763` = NA, `17764` = 3L, `16239` = NA,
`7548` = 6L, `12037` = 15L, `7813` = 13L, `12943` = NA, `17748` = NA,
`12703` = NA, `11964` = NA, `14018` = NA, `1769` = 12L, `13713` = NA,
`13100` = NA, `13866` = NA, `2527` = 5L, `2281` = NA, `4463` = NA,
`5815` = NA, `14040` = NA, `16560` = NA, `12887` = NA, `11167` = NA,
`6123` = NA, `5668` = NA, `3036` = NA, `7622` = 4L, `11470` = NA,
`4770` = NA, `17050` = NA, `6295` = NA, `9575` = 7L), .Label = c("66",
"70", "71", "73", "74", "75", "77", "79", "82", "83", "85",
"86", "88", "90", "93", "94"), class = "factor"), aaoaae = structure(c(3L,
16L, 24L, 19L, 25L, NA, 15L, 13L, 18L, NA, 20L, 6L, 7L, NA,
20L, 13L, 29L, 15L, 4L, 12L, 10L, NA, 25L, NA, 5L, 17L, 16L,
21L, 14L, 29L, 26L, NA, 28L, 26L, 31L, 8L, 14L, NA, NA, NA,
9L, 28L, 17L, 22L, 7L, 18L, 10L, NA, 8L, NA, 19L, 33L, 24L,
NA, 11L, 6L, 22L, 25L, 20L, 12L, NA, NA, 10L, NA, 11L, 18L,
17L, 17L, 10L, 5L, 14L, 14L, 30L, 26L, NA, 27L, 32L, 20L,
23L, 25L, 16L, 24L, NA, 16L, 25L, 6L, 5L, 4L, 17L, 14L, NA,
NA, 15L, 2L, 6L, 22L, 17L, 15L, NA, 19L), .Label = c("-9",
"52", "58", "60", "61", "62", "64", "65", "66", "67", "68",
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78",
"79", "80", "81", "82", "83", "84", "85", "86", "88", "90",
"91", "94"), class = "factor"), aaoaae2 = structure(c(3L,
16L, 24L, 19L, 25L, NA, 15L, 13L, 18L, NA, 20L, 6L, 7L, NA,
20L, 13L, 29L, 15L, 4L, 12L, 10L, NA, 25L, NA, 5L, 17L, 16L,
21L, 14L, 29L, 26L, NA, 28L, 26L, 31L, 8L, 14L, NA, NA, NA,
9L, 28L, 17L, 22L, 7L, 18L, 10L, NA, 8L, NA, 19L, 33L, 24L,
NA, 11L, 6L, 22L, 25L, 20L, 12L, NA, NA, 10L, NA, 11L, 18L,
17L, 17L, 10L, 5L, 14L, 14L, 30L, 26L, NA, 27L, 32L, 20L,
23L, 25L, 16L, 24L, NA, 16L, 25L, 6L, 5L, 4L, 17L, 14L, NA,
NA, 15L, 2L, 6L, 22L, 17L, 15L, NA, 19L), .Label = c("-9",
"52", "58", "60", "61", "62", "64", "65", "66", "67", "68",
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78",
"79", "80", "81", "82", "83", "84", "85", "86", "88", "90",
"91", "94"), class = "factor"), apoe_1 = structure(c(3L,
3L, 3L, 3L, 3L, NA, 3L, 3L, 3L, NA, 3L, 2L, 3L, NA, 3L, 2L,
3L, 4L, 4L, 3L, 3L, NA, 4L, 3L, 3L, 3L, 3L, 3L, 3L, NA, 2L,
NA, 3L, 3L, 2L, 4L, 4L, NA, NA, NA, 4L, 3L, 4L, 2L, NA, 3L,
4L, NA, 3L, NA, 4L, 3L, 2L, NA, 4L, 3L, 3L, 3L, 3L, 3L, 3L,
NA, 3L, NA, 4L, 3L, 3L, 3L, 3L, 4L, 3L, 3L, 3L, 3L, NA, 3L,
3L, 4L, 3L, 3L, 3L, 3L, NA, 3L, 3L, 3L, 3L, 3L, 3L, 3L, NA,
NA, 3L, 3L, 3L, 3L, 3L, 4L, NA, 4L), .Label = c("-9", "2",
"3", "4"), class = "factor"), apoe_2 = structure(c(4L, 4L,
3L, 3L, 3L, NA, 4L, 4L, 4L, NA, 3L, 3L, 3L, NA, 4L, 3L, 3L,
4L, 4L, 3L, 4L, NA, 2L, 4L, 3L, 4L, 3L, 4L, 3L, NA, 3L, NA,
3L, 3L, 3L, 4L, 2L, NA, NA, NA, 2L, 3L, 3L, 3L, NA, 3L, 3L,
NA, 3L, NA, 3L, 3L, 2L, NA, 3L, 4L, 4L, 4L, 3L, 4L, 4L, NA,
4L, NA, 2L, 3L, 3L, 3L, 4L, 3L, 2L, 4L, 4L, 3L, NA, 3L, 3L,
4L, 3L, 3L, 4L, 3L, NA, 3L, 3L, 2L, 3L, 2L, 3L, 4L, NA, NA,
2L, 4L, 3L, 4L, 2L, 3L, NA, 4L), .Label = c("-9", "2", "3",
"4"), class = "factor"), apoe4any = structure(c(3L, 3L, 2L,
2L, 2L, NA, 3L, 3L, 3L, NA, 2L, 2L, 2L, NA, 3L, 2L, 2L, 3L,
3L, 2L, 3L, NA, 3L, 3L, 2L, 3L, 2L, 3L, 2L, NA, 2L, NA, 2L,
2L, 2L, 3L, 3L, NA, NA, NA, 3L, 2L, 3L, 2L, NA, 2L, 3L, NA,
2L, NA, 3L, 2L, 2L, NA, 3L, 3L, 3L, 3L, 2L, 3L, 3L, NA, 3L,
NA, 3L, 2L, 2L, 2L, 3L, 3L, 2L, 3L, 3L, 2L, NA, 2L, 2L, 3L,
2L, 2L, 3L, 2L, NA, 2L, 2L, 2L, 2L, 2L, 2L, 3L, NA, NA, 2L,
3L, 2L, 3L, 2L, 3L, NA, 3L), .Label = c("-9", "0", "1"), class = "factor"),
apoe4dose = structure(c(3L, 3L, 2L, 2L, 2L, NA, 3L, 3L, 3L,
NA, 2L, 2L, 2L, NA, 3L, 2L, 2L, 4L, 4L, 2L, 3L, NA, 3L, 3L,
2L, 3L, 2L, 3L, 2L, NA, 2L, NA, 2L, 2L, 2L, 4L, 3L, NA, NA,
NA, 3L, 2L, 3L, 2L, NA, 2L, 3L, NA, 2L, NA, 3L, 2L, 2L, NA,
3L, 3L, 3L, 3L, 2L, 3L, 3L, NA, 3L, NA, 3L, 2L, 2L, 2L, 3L,
3L, 2L, 3L, 3L, 2L, NA, 2L, 2L, 4L, 2L, 2L, 3L, 2L, NA, 2L,
2L, 2L, 2L, 2L, 2L, 3L, NA, NA, 2L, 3L, 2L, 3L, 2L, 3L, NA,
4L), .Label = c("-9", "0", "1", "2"), class = "factor"),
Ethnicity = structure(c(`5068` = 4L, `15562` = 4L, `8939` = 4L,
`17602` = 3L, `3173` = 4L, `12591` = 4L, `17391` = 4L, `8241` = 4L,
`9746` = 4L, `9673` = 4L, `16594` = 4L, `16911` = 4L, `4796` = 4L,
`6598` = 4L, `11462` = 4L, `16425` = 4L, `12698` = 4L, `17118` = 4L,
`1501` = 4L, `13294` = 4L, `8072` = 4L, `11642` = 4L, `4164` = 1L,
`9035` = 4L, `16691` = 4L, `16002` = 4L, `3915` = 2L, `7409` = 4L,
`9759` = 4L, `6130` = 4L, `15153` = 4L, `13539` = 4L, `13262` = 4L,
`742` = 4L, `17592` = 3L, `16812` = 4L, `213` = 1L, `11963` = 4L,
`12093` = 4L, `11910` = 4L, `15813` = 4L, `1104` = 4L, `1966` = 4L,
`5589` = 1L, `8860` = 4L, `482` = 4L, `3967` = 4L, `5869` = 4L,
`12435` = 4L, `11675` = 4L, `16701` = 4L, `5893` = 4L, `16880` = 4L,
`13290` = 4L, `1097` = 4L, `1476` = 4L, `9100` = 4L, `6220` = 4L,
`15393` = 4L, `16631` = 4L, `9641` = 4L, `13485` = 4L, `1028` = 4L,
`8200` = 4L, `12190` = 4L, `5581` = 4L, `7266` = 4L, `12254` = 4L,
`15763` = 4L, `17764` = 3L, `16239` = 4L, `7548` = 4L, `12037` = 4L,
`7813` = 4L, `12943` = 4L, `17748` = 3L, `12703` = 4L, `11964` = 4L,
`14018` = 4L, `1769` = 4L, `13713` = 4L, `13100` = 4L, `13866` = 4L,
`2527` = 4L, `2281` = 2L, `4463` = 4L, `5815` = 4L, `14040` = 4L,
`16560` = 4L, `12887` = 4L, `11167` = 4L, `6123` = 4L, `5668` = 4L,
`3036` = 4L, `7622` = 4L, `11470` = 4L, `4770` = 2L, `17050` = 4L,
`6295` = 4L, `9575` = 4L), .Label = c("AA", "Asian", "Hispanic",
"NHW"), class = "factor")), row.names = c(NA, -100L), class = "data.frame")
We can group by 'status' and 'Ethnicity' and take the sum of a logical vector for each age threshold
library(dplyr)
# Count, per status x Ethnicity group, how many patients have an age at onset
# below 75 and below 70. age_at_onset is a factor, so it is converted through
# character to recover the real ages rather than the internal level codes.
df %>%
  mutate(onset_years = as.numeric(as.character(age_at_onset))) %>%
  group_by(status, Ethnicity) %>%
  summarise(
    # Summing a logical vector counts its TRUE values; NA ages are ignored.
    n_75 = sum(onset_years < 75, na.rm = TRUE),
    n_70 = sum(onset_years < 70, na.rm = TRUE)
  )
-output
# A tibble: 10 x 4
# Groups: status [4]
# status Ethnicity n_75 n_70
# <fct> <fct> <int> <int>
# 1 " 1" Asian 0 0
# 2 " 1" Hispanic 0 0
# 3 " 1" NHW 0 0
# 4 " 2" Asian 1 0
# 5 " 2" Hispanic 1 1
# 6 " 2" NHW 18 9
# 7 " 3" AA 1 0
# 8 " 3" NHW 5 5
# 9 <NA> AA 1 0
#10 <NA> NHW 0 0
This works:
> df %>%
+   select(iid, age_at_onset, status, Ethnicity) %>%
+   # Use the column of the piped data (not df$age_at_onset) so the flags always
+   # line up with the rows actually flowing through the pipeline.
+   mutate(age = as.numeric(as.character(age_at_onset)),
+          LT75 = ifelse(age < 75, 1, 0),
+          LT70 = ifelse(age < 70, 1, 0)) %>%
+   group_by(status, Ethnicity) %>%
+   # na.rm takes a logical; spell it TRUE instead of relying on 1 being coerced.
+   summarise(Lessthan75 = sum(LT75, na.rm = TRUE), Lessthan70 = sum(LT70, na.rm = TRUE))
`summarise()` regrouping output by 'status' (override with `.groups` argument)
# A tibble: 10 x 4
# Groups: status [4]
status Ethnicity Lessthan75 Lessthan70
<fct> <fct> <dbl> <dbl>
1 " 1" Asian 0 0
2 " 1" Hispanic 0 0
3 " 1" NHW 0 0
4 " 2" Asian 1 0
5 " 2" Hispanic 1 1
6 " 2" NHW 18 9
7 " 3" AA 1 0
8 " 3" NHW 5 5
9 NA AA 1 0
10 NA NHW 0 0
>
You have age_at_onset as factor, convert it to numeric, then use cut to divide data into different buckets and count to count how many iid fall into each bucket.
library(dplyr)
# Bucket the age at onset and count patients per Ethnicity x status x bucket.
df %>%
  mutate(
    # age_at_onset is a factor: go through character to get the real ages.
    age_at_onset = as.numeric(as.character(age_at_onset)),
    # right = FALSE makes the buckets left-closed, [a, b), so they read
    # "< 70", "70 to < 75" and ">= 75" as the question asks. With the default
    # (a, b] a patient whose onset is exactly 70 would be counted as "under 70".
    age_group = cut(age_at_onset, c(-Inf, 70, 75, Inf), right = FALSE)
  ) %>%
  count(Ethnicity, status, age_group)
# The "< 75" total for a group is the sum of its first two buckets.
# Ethnicity status age_group n
#1 AA 3 [70,75) 1
#2 AA 3 [75, Inf) 1
#3 AA <NA> [70,75) 1
#4 Asian 1 <NA> 2
#5 Asian 2 [70,75) 1
#6 Hispanic 1 <NA> 2
#7 Hispanic 2 [-Inf,70) 1
#8 Hispanic 2 [75, Inf) 1
#9 NHW 1 <NA> 29
#10 NHW 2 [-Inf,70) 9
#11 NHW 2 [70,75) 9
#12 NHW 2 [75, Inf) 14
#13 NHW 2 <NA> 2
#14 NHW 3 [-Inf,70) 5
#15 NHW 3 [75, Inf) 1
#16 NHW <NA> [75, Inf) 1
#17 NHW <NA> <NA> 20
If needed you can drop the NA values in age_group column with filter(!is.na(age_group)).
I have two data frames that I am trying to join using full_join, here is a subset of my data:
df1 <- structure(list(Team = structure(c(4L, 3L, 5L, 6L, 7L, 7L, 8L,
8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 12L, 14L, 13L, 15L, 15L,
16L, 16L, 17L, 17L, 18L, 18L, 19L, 19L, 20L, 20L, 21L, 22L, 23L,
23L, 24L, 24L, 25L, 25L, 28L, 28L, 29L, 29L, 30L, 30L, 31L, 31L,
32L, 32L, 33L, 33L, 34L, 34L, 2L, 1L, 26L, 27L), .Label = c("76ers",
"76ers ", "Bucks", "Bucks ", "Bull ", "Bulls ", "Cavaliers ",
"Celtics ", "Clippers ", "Grizzlies ", "Hawks ", "Heat ", "Hornets",
"Hornets ", "Jazz ", "Kings ", "Knicks ", "Lakers ", "Magic ",
"Mavericks ", "Net ", "Nets ", "Nuggets ", "Pacers ", "Pelicans ",
"Pistons", "Pistons ", "Raptors ", "Rockets ", "Spurs ", "Thunder ",
"Timberwolves ", "Warriors ", "Wizards "), class = "factor"),
Injury.Count = c(3L, 3L, 1L, 1L, 1L, 2L, 0L, 2L, 1L, 1L,
0L, 2L, 1L, 0L, 5L, 4L, 3L, 2L, 3L, 0L, 3L, 3L, 4L, 6L, 5L,
0L, 2L, 2L, 1L, 2L, 0L, 1L, 3L, 4L, 2L, 6L, 2L, 1L, 1L, 1L,
3L, 3L, 4L, 5L, 1L, 6L, 4L, 2L, 0L, 2L, 2L, 1L, 5L, 6L, 1L,
1L), HomevsAway = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L), .Label = c("0", "1"), class = "factor")), row.names = c(NA,
-56L), class = "data.frame")
df2 <- structure(list(Team = structure(c(1L, 1L, 2L, 2L, 3L, 4L, 4L,
5L, 6L, 7L, 8L, 9L, 9L, 10L, 10L, 11L, 12L, 12L, 13L, 13L, 14L,
15L, 15L, 16L, 16L, 17L, 18L, 18L, 19L, 19L, 20L, 20L, 21L, 21L,
22L, 22L, 23L, 23L, 24L, 24L, 25L, 25L, 26L, 26L, 27L, 28L, 28L,
3L, 5L, 6L, 7L, 8L, 11L, 14L, 17L, 27L), .Label = c("76ers",
"Bucks", "Bulls", "Cavaliers", "Celtics", "Clippers", "Grizzlies",
"Hawks", "Heat", "Hornets", "Jazz", "Kings", "Knicks", "Lakers",
"Magic", "Mavericks", "Nets", "Nuggets", "Pacers", "Pelicans",
"Pistons", "Raptors", "Rockets", "Spurs", "Thunder", "Timberwolves",
"Warriors", "Wizards"), class = "factor"), HomevsAway = structure(c(1L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L,
2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L,
1L, 1L, 2L, 2L, 2L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
t_1 = c(55.883, 140.1, 32.2, 37.967, 29.85, 24.317, 57.316,
17.967, 19.05, 36.95, 16.167, 95.317, 86.533, 21.334, 52.567,
40.75, 28.3, 68.15, 97.067, 102.233, 26.866, 71.033, 34.467,
24.233, 42.033, 22.433, 59.033, 41.516, 12.7, 107.996, 6.5,
32.783, 0, 23.217, 13.93, 0, 54.88, 23.617, 83.834, 106.794,
17.56, 27.76, 85.83, 0.017, 35.183, 22.467, 25.033, 0, 0,
0, 0, 0, 0, 0, 0, 0), t_3 = c(197.3164, 388.6827, 126.2663,
111.916, 61.95, 91.55, 167.067, 104.083, 71.067, 135.383,
45.633, 261.317, 267.399, 114.6997, 159.2, 152.034, 84.8337,
204.3003, 351.449, 376.317, 86.333, 213.9, 99.767, 65.1,
131.767, 73.317, 126.416, 129.066, 73.383, 347.0994, 4761,
113.367, 0, 89.933, 59.8, 0, 188.983, 124.384, 215.666, 289.9667,
92, 144.2497, 254.083, 32.0333, 122.1837, 102.533, 82.817,
0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA, -56L), groups = structure(list(
Team = structure(1:28, .Label = c("76ers", "Bucks", "Bulls",
"Cavaliers", "Celtics", "Clippers", "Grizzlies", "Hawks",
"Heat", "Hornets", "Jazz", "Kings", "Knicks", "Lakers", "Magic",
"Mavericks", "Nets", "Nuggets", "Pacers", "Pelicans", "Pistons",
"Raptors", "Rockets", "Spurs", "Thunder", "Timberwolves",
"Warriors", "Wizards"), class = "factor"), .rows = structure(list(
1:2, 3:4, c(5L, 48L), 6:7, c(8L, 49L), c(9L, 50L), c(10L,
51L), c(11L, 52L), 12:13, 14:15, c(16L, 53L), 17:18,
19:20, c(21L, 54L), 22:23, 24:25, c(26L, 55L), 27:28,
29:30, 31:32, 33:34, 35:36, 37:38, 39:40, 41:42, 43:44,
c(45L, 56L), 46:47), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 28L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
I have tried to join the data using full_join:
# Full outer join of df1 and df2 on the two key columns. Rows whose key values
# do not match exactly in both tables are kept and padded with NA.
LR <- full_join(df1, df2, by = c("HomevsAway", "Team"))
The LR output I get contains extra rows with seemingly random NAs. My expected output is a table with 56 rows and 5 columns.
The problem is that in your first data.frame the Team names have a trailing space. This means that instead of the string "Bucks" you have the string "Bucks ".
These two strings cannot be joined.
Here is how you can fix your data. First delete the trailing spaces using sub and convert Team to a character vector. Then the full_join works as planned:
# df1: strip the trailing spaces from the team names. The pattern is anchored
# with `$` so only spaces at the end of the string are removed; an unanchored
# " +" would remove the first run of spaces anywhere in the name instead
# (e.g. the space inside a two-word team name).
df1_new <- df1 %>%
  as_tibble() %>%
  mutate(Team = sub(" +$", "", as.character(Team)))

# df2: it is a grouped_df grouped by Team, so drop the grouping before
# rewriting Team as a character vector to match df1_new.
df2_new <- df2 %>%
  ungroup() %>%
  mutate(Team = as.character(Team))

# Join on both keys; rows without an exact key match are kept and filled
# with NA.
df1_new %>%
  full_join(df2_new, by = c("Team", "HomevsAway"))
# A tibble: 58 x 5
Team Injury.Count HomevsAway t_1 t_3
<chr> <int> <fct> <dbl> <dbl>
1 Bucks 3 0 32.2 126.
2 Bucks 3 1 38.0 112.
3 Bull 1 0 NA NA
4 Bulls 1 1 0 0
5 Cavaliers 1 0 24.3 91.6
6 Cavaliers 2 1 57.3 167.
7 Celtics 0 0 0 0
8 Celtics 2 1 18.0 104.
9 Clippers 1 0 0 0
10 Clippers 1 1 19.0 71.1
Note that there are still a few NAs. This is due to some typos: Bull vs Bulls and Net vs Nets.
Consider the following example data:
ex = structure(list(group = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 1L,
2L, 3L, 4L, 6L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 6L, 1L,
2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 6L,
1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L,
5L, 6L, 1L, 2L, 1L, 2L, 3L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 5L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 1L, 1L, 1L,
4L), .Label = c("A", "B", "C", "D", "E", "F"), class = "factor"),
ID = structure(c(35L, 35L, 35L, 35L, 35L, 35L, 1L, 1L, 1L,
1L, 1L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 9L, 9L,
9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L,
11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L,
13L, 13L, 14L, 14L, 14L, 14L, 14L, 14L, 21L, 21L, 22L, 22L,
22L, 22L, 2L, 3L, 4L, 5L, 8L, 15L, 16L, 17L, 18L, 19L, 19L,
20L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 30L, 31L, 32L,
33L, 34L), .Label = c("10", "107", "108", "109", "124", "17",
"18", "187", "19", "21", "24", "26", "27", "28", "335", "336",
"339", "340", "341", "342", "38", "39", "576", "577", "578",
"579", "580", "581", "582", "583", "584", "585", "586", "592",
"6"), class = "factor"), value = c(1L, 7L, 4L, 4L, 3L, 9L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 5L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 33L, 27L, 28L, 21L, 28L, 1L, 3L, 1L, 1L, 1L, 1L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 3L, 2L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L)), class = "data.frame",
row.names = c(NA, -88L), .Names = c("group", "ID", "value")
)
Note that in group A, value = 1 for every ID. I use ggplot2 to create dot plot based on counts of the value variable using geom_dotplot and faceting by group:
# Dot plot of `value`, binned along x in histogram-style bins of width 1,
# drawn as one panel per group.
ggplot(ex, aes(x = value)) +
  geom_dotplot(
    method = "histodot",
    binwidth = 1
  ) +
  facet_wrap(~ group)
The dot stack in the first facet is cut off, even when exported using ggsave. Changing the y-axis limits has no effect, but changing the aspect ratio so that H >= W seems to fix the issue (usually by adding way more space to the top than necessary). Is this a bug, or am I specifying my dot plot incorrectly?
EDIT
One workaround is to flip my dotplot and bin by the y variable:
# Workaround: bin along the y axis instead, stacking the dots horizontally
# and centred on each group; every panel gets its own x scale.
ggplot(ex, aes(x = group, y = value)) +
  geom_dotplot(
    binaxis = "y",
    stackdir = "centerwhole",
    method = "histodot",
    binwidth = 1
  ) +
  facet_wrap(~ group, scales = "free_x")
Two other parameters that can help you are stackratio and dotsize. For example:
# Same dot plot, but with smaller dots (dotsize) packed more tightly in each
# stack (stackratio) so that tall stacks fit inside the panel.
ggplot(ex, aes(x = value)) +
  geom_dotplot(
    method = "histodot",
    binwidth = 1,
    dotsize = 0.75,
    stackratio = 0.9
  ) +
  facet_wrap(~ group) +
  # Remove the y axis title and its breaks.
  scale_y_continuous(name = NULL, breaks = NULL)
You would need to tweak the numbers until you got the layout you wanted.
I found an interesting workaround using geom_bar that achieves the same structure as a dot plot but with rectangles:
# Bar-based imitation of a dot plot: grouping by ID splits each bar into one
# outlined white rectangle per ID, stacked at its `value`, one panel per group.
ggplot(ex, aes(x = value, group = ID)) +
  geom_bar(
    width = 1,
    fill = "white",
    color = "black"
  ) +
  facet_wrap(~ group)
Although it results in rectangles (rather than dots) and you can't control the stack spacing. The rectangles get resized according to the plot window, which would be equivalent to tweaking the dot size in geom_dotplot. Also, it begs the question "why not just use a regular bar plot?"
This question already has answers here:
Convert data from long format to wide format with multiple measure columns
(6 answers)
Closed 4 years ago.
I need to reorganize a large dataset into a specific format for further analysis. Right now the data are in long format, with multiple records through time for each point. I need to reshape the data so that each point has a single record, but it will add many new columns of the time-specific data. I’ve looked at previous similar posts but I need to ultimately convert several of the current variables into columns, and I can’t find an example of such. Is there a way to accomplish this in a single reshape, or will I have to do several and then concatenate the new columns back together? Another wrinkle before I post the example is that not all points were sampled at each time-step, so I need those values to show up as NA. For example, (see data below) SitePoint A1 was not sampled at all in 2012, SitePoint A10 was not sampled during the first round in 2012, but K83 was sampled all nine times.
mydatain <- structure(list(SitePoint = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 6L, 6L), .Label = c("A1", "A10", "K145", "K83", "T15",
"T213"), class = "factor"), Year_Rotation = structure(c(1L, 2L,
3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 1L, 2L, 4L, 5L,
6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 1L, 7L), .Label = c("2010_1", "2010_2",
"2010_3", "2011_1", "2011_2", "2011_3", "2012_1", "2012_2", "2012_3"
), class = "factor"), MR_Fire = structure(c(5L, 6L, 6L, 2L, 9L,
9L, 5L, 6L, 6L, 2L, 9L, 9L, 7L, 8L, 16L, 17L, 21L, 22L, 23L,
25L, 3L, 4L, 10L, 11L, 12L, 13L, 14L, 15L, 18L, 19L, 20L, 1L,
2L, 2L, 5L, 6L, 6L, 11L, 11L, 12L, 7L, 24L), .Label = c("0",
"1", "10", "11", "12", "13", "14", "15", "2", "23", "24", "25",
"35", "36", "37", "39", "40", "47", "48", "49", "51", "52", "53",
"8", "9"), class = "factor"), fire_seas = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 3L), .Label = c("dry", "fire", "wet"
), class = "factor"), OptTSF = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 1L)), .Names = c("SitePoint", "Year_Rotation", "MR_Fire",
"fire_seas", "OptTSF"), row.names = c(31L, 32L, 33L, 34L, 35L,
36L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 10543L, 10544L,
10545L, 10546L, 10547L, 10548L, 10549L, 10550L, 14988L, 14989L,
14990L, 14991L, 14992L, 14993L, 14994L, 14995L, 14996L, 17370L,
17371L, 17372L, 17373L, 17374L, 17375L, 17376L, 17377L, 17378L,
19353L, 19354L), class = "data.frame")
Ultimately I need something like this:
myfinal <- structure(list(SitePoint = structure(1:6, .Label = c("A1", "A10",
"K145", "K83", "T15", "T213"), class = "factor"), MR_Fire_2010_1 = c(12L,
12L, 39L, 23L, 0L, 14L), MR_Fire_2010_2 = c(13L, 13L, 40L, 24L,
1L, NA), MR_Fire_2010_3 = c(13L, 13L, NA, 25L, 1L, NA), MR_Fire_2011_1 = c(1L,
1L, 51L, 35L, 12L, NA), MR_Fire_2011_2 = c(2L, 2L, 52L, 36L,
13L, NA), MR_Fire_2011_3 = c(2L, 2L, 53L, 37L, 13L, NA), MR_Fire_2012_1 = c(NA,
NA, 9L, 47L, 24L, 8L), MR_Fire_2012_2 = c(NA, 14L, 10L, 48L,
24L, NA), MR_Fire_2012_3 = c(NA, 15L, 11L, 49L, 25L, NA), season_2010_1 = structure(c(2L,
2L, 1L, 2L, 2L, 1L), .Label = c("dry", "fire"), class = "factor"),
season_2010_2 = structure(c(2L, 2L, 1L, 2L, 2L, NA), .Label = c("dry",
"fire"), class = "factor"), season_2010_3 = structure(c(1L,
1L, NA, 1L, 1L, NA), .Label = "fire", class = "factor"),
season_2011_1 = structure(c(2L, 2L, 1L, 2L, 2L, NA), .Label = c("dry",
"fire"), class = "factor"), season_2011_2 = structure(c(2L,
2L, 1L, 2L, 2L, NA), .Label = c("dry", "fire"), class = "factor"),
season_2011_3 = structure(c(2L, 2L, 1L, 2L, 2L, NA), .Label = c("dry",
"fire"), class = "factor"), season_2012_1 = structure(c(NA,
NA, 2L, 1L, 1L, 2L), .Label = c("fire", "wet"), class = "factor"),
season_2012_2 = structure(c(NA, 1L, 2L, 1L, 1L, NA), .Label = c("fire",
"wet"), class = "factor"), season_2012_3 = structure(c(NA,
1L, 2L, 1L, 1L, NA), .Label = c("fire", "wet"), class = "factor"),
OptTSF_2010_1 = c(1L, 1L, 0L, 1L, 1L, 1L), OptTSF_2010_2 = c(1L,
1L, 0L, 1L, 1L, NA), OptTSF_2010_3 = c(1L, 1L, NA, 1L, 1L,
NA), OptTSF_2011_1 = c(1L, 1L, 0L, 0L, 1L, NA), OptTSF_2011_2 = c(1L,
1L, 0L, 0L, 1L, NA), OptTSF_2011_3 = c(1L, 1L, 0L, 0L, 1L,
NA), OptTSF_2012_1 = c(NA, NA, 1L, 0L, 0L, 1L), OptTSF_2012_2 = c(NA,
1L, 1L, 0L, 0L, NA), OptTSF_2012_3 = c(NA, 1L, 1L, 0L, 0L,
NA)), .Names = c("SitePoint", "MR_Fire_2010_1", "MR_Fire_2010_2",
"MR_Fire_2010_3", "MR_Fire_2011_1", "MR_Fire_2011_2", "MR_Fire_2011_3",
"MR_Fire_2012_1", "MR_Fire_2012_2", "MR_Fire_2012_3", "season_2010_1",
"season_2010_2", "season_2010_3", "season_2011_1", "season_2011_2",
"season_2011_3", "season_2012_1", "season_2012_2", "season_2012_3",
"OptTSF_2010_1", "OptTSF_2010_2", "OptTSF_2010_3", "OptTSF_2011_1",
"OptTSF_2011_2", "OptTSF_2011_3", "OptTSF_2012_1", "OptTSF_2012_2",
"OptTSF_2012_3"), class = "data.frame", row.names = c(NA, -6L
))
The actual dataset is about 23656 records X 15 variables, so doing it by hand is likely to cause major headaches and potential for mistakes. Any help or suggestions are appreciated. If this has been answered elsewhere, apologies. I couldn’t find anything directly applicable; everything seemed to relate to three columns and only one of those being extracted as new variables. Thanks.
SP
dcast from the devel version of data.table i.e., v1.9.5 can cast multiple columns simultaneously. It can be installed from here.
library(data.table) ## v1.9.5+
# Cast long -> wide: one row per SitePoint and, for each of the three
# value.var columns, one new column per Year_Rotation level.
# Note: setDT() converts mydatain to a data.table by reference, so mydatain
# itself is a data.table after this call.
dcast(setDT(mydatain), SitePoint~Year_Rotation,
value.var=c('MR_Fire', 'fire_seas', 'OptTSF'))
You can use reshape to change the structure of your dataframe from long to wide using the following code:
# Long -> wide with base R: one row per SitePoint, and one column per
# remaining variable (MR_Fire, fire_seas, OptTSF) for each Year_Rotation.
# SitePoint / Year_Rotation combinations that were not sampled come out as NA.
# sep = "_" names the new columns like MR_Fire_2010_1, matching the requested
# layout; the default separator would give MR_Fire.2010_1.
reshape(
  mydatain,
  timevar = "Year_Rotation",
  idvar = "SitePoint",
  direction = "wide",
  sep = "_"
)