I have fish count data and am trying to create a new data frame using averages of the measurements based on conditions of two different columns. Here is my data:
df <- structure(list(SITE = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L,
2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L), .Label = c("1", "2", "3"), class = "factor"),
ZONE = structure(c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("CREST", "INNER_FLAT", "MID_FLAT",
"OUTER_FLAT", "SLOPE"), class = "factor"), C_TOTAL = c(44L,
7L, 20L, 14L, 0L, 4L, 2L, 3L, 1L, 8L, 28L, 24L, 31L, 12L,
33L, 6L, 16L, 33L, 75L, 21L, 60L, 81L, 37L, 89L, 21L, 35L,
71L, 5L, 2L, 0L, 0L, 10L, 23L, 0L, 5L, 11L, 3L, 1L, 5L, 0L,
0L, 8L, 7L, 6L, 42L), C_M2 = c(0.210465706, 0.029861994,
0.090324177, 0.066599319, 0, 0.022092452, 0.011750593, 0.015245519,
0.004710433, 0.033111594, 0.155094195, 0.110576495, 0.193659068,
0.059152822, 0.192379108, 0.047800772, 0.08917095, 0.141336411,
0.402538785, 0.130438337, 0.315206235, 0.460746849, 0.278643938,
0.467754275, 0.192830321, 0.119928472, 0.411502497, 0.015370489,
0.005150184, 0, 0, 0.034651441, 0.067824733, 0, 0.009805851,
0.034844309, 0.010614352, 0.004131048, 0.01850898, 0, 0,
0.029195413, 0.021409016, 0.030498145, 0.172406074), TRANS_A = c(209.0601875,
234.411677, 221.4246571, 210.2123593, 226.6158348, 181.0573136,
170.2041767, 196.7791332, 212.294701, 241.6072127, 180.5354478,
217.0443184, 160.0751279, 202.8643689, 171.536298, 125.5209863,
179.4306337, 233.485481, 186.3174499, 160.9956132, 190.3515643,
175.801528, 132.7859497, 190.2708425, 108.9040348, 291.8406241,
172.5384427, 325.2986863, 388.3356059, 303.1957479, 261.1574528,
288.5882879, 339.1093313, 239.1118021, 509.89965, 315.6899993,
282.6362022, 242.0693453, 270.1391425, 294.8864591, 321.2013381,
274.0156514, 326.9650539, 196.7332763, 243.6109069), SCARID_T = c(35L,
4L, 4L, 13L, 0L, 4L, 2L, 0L, 1L, 4L, 20L, 12L, 17L, 5L, 20L,
6L, 6L, 18L, 63L, 11L, 41L, 75L, 34L, 89L, 14L, 33L, 68L,
0L, 0L, 0L, 0L, 10L, 22L, 0L, 0L, 10L, 0L, 0L, 1L, 0L, 0L,
6L, 0L, 4L, 42L), ACAN_T = c(4L, 0L, 11L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 8L, 5L, 0L, 0L, 0L, 0L, 3L, 2L, 7L, 8L, 8L, 1L,
1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 5L, 0L, 0L,
0L, 1L, 0L, 0L, 2L, 0L, 0L, 0L), SIG_T = c(5L, 3L, 5L, 1L,
0L, 0L, 0L, 3L, 0L, 4L, 0L, 7L, 14L, 7L, 13L, 0L, 7L, 13L,
5L, 2L, 11L, 5L, 2L, 0L, 7L, 1L, 3L, 5L, 2L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 3L, 1L, 3L, 0L, 0L, 0L, 7L, 2L, 0L)), row.names = c(NA,
-45L), class = "data.frame")
I want to average all the measurements by each zone, but also according to site. So I want a new data frame where each site has one measurement for each zone.
Can anyone help me? Thanks!
library(dplyr)
# Average every numeric column within each SITE x ZONE combination, giving
# one row per site/zone pair. The result is printed at top level and is
# still grouped by SITE.
df %>%
  group_by(SITE, ZONE) %>%
  summarise(across(where(is.numeric), mean))
# A tibble: 15 x 8
# Groups: SITE [3]
SITE ZONE C_TOTAL C_M2 TRANS_A SCARID_T ACAN_T SIG_T
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 CREST 20 0.0996 213. 12 4.33 3.67
2 1 INNER_FLAT 3 0.0111 265. 0.333 0.333 2.33
3 1 MID_FLAT 2.33 0.00684 339. 0 0 2.33
4 1 OUTER_FLAT 52 0.283 179. 38.3 7.67 6
5 1 SLOPE 23.7 0.110 222. 14.3 5 4.33
6 2 CREST 25.3 0.148 178. 14 0 11.3
7 2 INNER_FLAT 2.67 0.00973 297. 2 0.667 0
8 2 MID_FLAT 11 0.0342 296. 10.7 0.333 0
9 2 OUTER_FLAT 69 0.402 166. 66 0.667 2.33
10 2 SLOPE 6 0.0296 206. 5.67 0 0.333
11 3 CREST 18.3 0.0928 179. 10 1.67 6.67
12 3 INNER_FLAT 18.3 0.0748 256. 15.3 0 3
13 3 MID_FLAT 5.33 0.0149 355. 3.33 1.67 0.333
14 3 OUTER_FLAT 42.3 0.241 191. 38.3 0.333 3.67
15 3 SLOPE 2 0.0106 193. 1 0 1
I am relatively new to R, and I need help with a user-defined function. I would like to see where each observation of a data frame ranks in a subset of similar observations of the same data frame. I'm having trouble referencing the original observation, in order to extract its rank, within my function.
Here is a sample of my data:
> dput(df)
structure(list(Name = c("Alex Abrines", "Steven Adams", "Cole Aldrich",
"LaMarcus Aldridge", "Kyle Anderson", "Ryan Anderson", "Giannis Antetokounmpo",
"Carmelo Anthony", "OG Anunoby", "Darrell Arthur", "Will Barton",
"Bradley Beal", "Davis Bertans", "Nemanja Bjelica", "Malcolm Brogdon",
"Aaron Brooks", "Dillon Brooks", "Lorenzo Brown", "Sterling Brown",
"Reggie Bullock", "Jimmy Butler", "Dwight Buycks", "Clint Capela",
"Wilson Chandler", "Torrey Craig", "Jamal Crawford", "Deyonta Davis",
"Matthew Dellavedova", "DeMar DeRozan", "Gorgui Dieng", "Andre Drummond",
"James Ennis", "Kenneth Faried", "Raymond Felton", "Terrance Ferguson",
"Bryn Forbes", "Tim Frazier", "Langston Galloway", "Marc Gasol",
"Pau Gasol", "Paul George", "Marcus Georges-Hunt", "Taj Gibson",
"Manu Ginobili", "Marcin Gortat", "Jerami Grant", "Danny Green",
"Gerald Green", "JaMychal Green", "Blake Griffin", "James Harden",
"Gary Harris", "Andrew Harrison", "Myke Henry", "John Henson",
"Nene Hilario", "Darrun Hilliard", "Josh Huestis", "Serge Ibaka",
"Stanley Johnson", "Nikola Jokic", "Tyus Jones", "Luke Kennard",
"Sean Kilpatrick", "Joffrey Lauvergne", "Kyle Lowry", "Trey Lyles",
"Ian Mahinmi", "Thon Maker", "Jarell Martin", "Luc Mbah a Moute",
"Ben McLemore", "Jodie Meeks", "Khris Middleton", "Patty Mills",
"Eric Moreland", "Markieff Morris", "Emmanuel Mudiay", "Shabazz Muhammad",
"Xavier Munford", "Dejounte Murray", "Jamal Murray", "Lucas Nogueira",
"Kelly Oubre", "Tony Parker", "Patrick Patterson", "Brandon Paul",
"Chris Paul", "Marshall Plumlee", "Jakob Poeltl", "Otto Porter",
"Norman Powell", "Willie Reed", "Tomas Satoransky", "Mike Scott",
"Wayne Selden", "Pascal Siakam", "Ish Smith", "Tony Snell", "Jeff Teague",
"Anthony Tolliver", "Karl-Anthony Towns", "P.J. Tucker", "Jonas Valanciunas",
"Rashad Vaughn", "Russell Westbrook", "Andrew Wiggins", "D.J. Wilson",
"Delon Wright"), Pos = structure(c(5L, 1L, 1L, 1L, 3L, 2L, 3L,
2L, 2L, 2L, 4L, 4L, 2L, 2L, 4L, 4L, 5L, 4L, 4L, 5L, 3L, 4L, 1L,
2L, 5L, 4L, 1L, 4L, 5L, 1L, 1L, 2L, 2L, 4L, 5L, 4L, 4L, 4L, 1L,
1L, 2L, 4L, 2L, 4L, 1L, 2L, 5L, 5L, 2L, 2L, 4L, 4L, 4L, 2L, 1L,
1L, 4L, 2L, 1L, 2L, 1L, 4L, 4L, 4L, 1L, 4L, 2L, 1L, 1L, 2L, 2L,
4L, 4L, 3L, 4L, 1L, 2L, 4L, 3L, 4L, 4L, 4L, 1L, 2L, 4L, 2L, 4L,
4L, 1L, 1L, 2L, 4L, 1L, 4L, 2L, 5L, 2L, 4L, 5L, 4L, 1L, 1L, 2L,
1L, 4L, 4L, 3L, 2L, 4L), .Label = c("C", "PF", "SF", "PG", "SG"
), class = "factor"), Date = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "2018-02-01 *", class = "factor"),
Tm = structure(c(7L, 7L, 6L, 8L, 8L, 3L, 5L, 7L, 9L, 1L,
1L, 10L, 8L, 6L, 5L, 6L, 4L, 9L, 5L, 2L, 6L, 2L, 3L, 1L,
1L, 6L, 4L, 5L, 9L, 6L, 2L, 4L, 1L, 7L, 7L, 8L, 10L, 2L,
4L, 8L, 7L, 6L, 6L, 8L, 10L, 7L, 8L, 3L, 4L, 2L, 3L, 1L,
4L, 4L, 5L, 3L, 8L, 7L, 9L, 2L, 1L, 6L, 2L, 5L, 8L, 9L, 1L,
10L, 5L, 4L, 3L, 4L, 10L, 5L, 8L, 2L, 10L, 1L, 6L, 5L, 8L,
1L, 9L, 10L, 8L, 7L, 8L, 3L, 5L, 9L, 10L, 9L, 2L, 10L, 10L,
4L, 9L, 2L, 5L, 6L, 2L, 6L, 3L, 9L, 5L, 7L, 6L, 5L, 9L), .Label = c("DEN",
"DET", "HOU", "MEM", "MIL", "MIN", "OKC", "SAS", "TOR", "WAS"
), class = "factor"), Opp = structure(c(1L, 1L, 5L, 3L, 3L,
8L, 6L, 1L, 10L, 7L, 7L, 9L, 3L, 5L, 6L, 5L, 2L, 10L, 6L,
4L, 5L, 4L, 8L, 7L, 7L, 5L, 2L, 6L, 10L, 5L, 4L, 2L, 7L,
1L, 1L, 3L, 9L, 4L, 2L, 3L, 1L, 5L, 5L, 3L, 9L, 1L, 3L, 8L,
2L, 4L, 8L, 7L, 2L, 2L, 6L, 8L, 3L, 1L, 10L, 4L, 7L, 5L,
4L, 6L, 3L, 10L, 7L, 9L, 6L, 2L, 8L, 2L, 9L, 6L, 3L, 4L,
9L, 7L, 5L, 6L, 3L, 7L, 10L, 9L, 3L, 1L, 3L, 8L, 6L, 10L,
9L, 10L, 4L, 9L, 9L, 2L, 10L, 4L, 6L, 5L, 4L, 5L, 8L, 10L,
6L, 1L, 5L, 6L, 10L), .Label = c("DEN", "DET", "HOU", "MEM",
"MIL", "MIN", "OKC", "SAS", "TOR", "WAS"), class = "factor"),
MP = c(29L, 32L, 3L, 34L, 30L, 29L, 36L, 34L, 21L, 1L, 36L,
38L, 13L, 14L, 10L, 3L, 32L, 11L, 24L, 35L, 40L, 19L, 35L,
34L, 22L, 17L, 15L, 25L, 38L, 13L, 28L, 15L, 10L, 14L, 4L,
18L, 17L, 4L, 33L, 20L, 36L, 6L, 33L, 20L, 26L, 25L, 28L,
30L, 20L, 35L, 37L, 38L, 34L, 22L, 32L, 13L, 8L, 12L, 35L,
36L, 37L, 17L, 21L, 18L, 2L, 35L, 15L, 19L, 13L, 28L, 35L,
10L, 9L, 35L, 24L, 5L, 32L, 14L, 3L, 7L, 24L, 34L, 3L, 23L,
17L, 15L, 2L, 30L, 5L, 16L, 29L, 26L, 5L, 28L, 19L, 31L,
13L, 29L, 29L, 28L, 22L, 33L, 31L, 29L, 4L, 39L, 30L, 4L,
13L), Player.ID = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L,
32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 42L, 41L, 43L,
44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 53L, 52L, 54L, 55L,
56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L,
68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L,
80L, 81L, 82L, 83L, 84L, 85L, 86L, 87L, 88L, 89L, 90L, 91L,
92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 100L, 101L, 102L,
103L, 104L, 105L, 106L, 107L, 108L, 109L), .Label = c("abrinal01",
"adamsst01", "aldrico01", "aldrila01", "anderky01", "anderry01",
"antetgi01", "anthoca01", "anunoog01", "arthuda01", "bartowi01",
"bealbr01", "bertada01", "bjeline01", "brogdma01", "brookaa01",
"brookdi01", "brownlo01", "brownst02", "bullore01", "butleji01",
"buyckdw01", "capelca01", "chandwi01", "craigto01", "crawfja01",
"davisde01", "dellama01", "derozde01", "dienggo01", "drumman01",
"ennisja01", "farieke01", "feltora01", "fergute01", "forbebr01",
"fraziti01", "gallola01", "gasolma01", "gasolpa01", "georgma01",
"georgpa01", "gibsota01", "ginobma01", "gortama01", "grantje01",
"greenda02", "greenge01", "greenja01", "griffbl01", "hardeja01",
"harrian01", "harriga01", "henrymy01", "hensojo01", "hilarne01",
"hillida01", "huestjo01", "ibakase01", "johnsst04", "jokicni01",
"jonesty01", "kennalu01", "kilpase01", "lauvejo01", "lowryky01",
"lylestr01", "mahinia01", "makerth01", "martija01", "mbahalu01",
"mclembe01", "meeksjo01", "middlkh01", "millspa02", "moreler01",
"morrima02", "mudiaem01", "muhamsh01", "munfoxa02", "murrade01",
"murraja01", "noguelu01", "oubreke01", "parketo01", "pattepa01",
"paulbr01", "paulch01", "plumlma02", "poeltja01", "porteot01",
"powelno01", "reedwi02", "satorto01", "scottmi01", "seldewa01",
"siakapa01", "smithis01", "snellto01", "teaguje01", "tollian01",
"townska01", "tuckepj01", "valanjo01", "vaughra01", "westbru01",
"wiggian01", "wilsodj01", "wrighde01"), class = "factor"),
Game.ID = structure(c(7L, 7L, 6L, 8L, 8L, 3L, 5L, 7L, 9L,
1L, 1L, 10L, 8L, 6L, 5L, 6L, 4L, 9L, 5L, 2L, 6L, 2L, 3L,
1L, 1L, 6L, 4L, 5L, 9L, 6L, 2L, 4L, 1L, 7L, 7L, 8L, 10L,
2L, 4L, 8L, 7L, 6L, 6L, 8L, 10L, 7L, 8L, 3L, 4L, 2L, 3L,
1L, 4L, 4L, 5L, 3L, 8L, 7L, 9L, 2L, 1L, 6L, 2L, 5L, 8L, 9L,
1L, 10L, 5L, 4L, 3L, 4L, 10L, 5L, 8L, 2L, 10L, 1L, 6L, 5L,
8L, 1L, 9L, 10L, 8L, 7L, 8L, 3L, 5L, 9L, 10L, 9L, 2L, 10L,
10L, 4L, 9L, 2L, 5L, 6L, 2L, 6L, 3L, 9L, 5L, 7L, 6L, 5L,
9L), .Label = c("2018-02-01 * DEN", "2018-02-01 * DET", "2018-02-01 * HOU",
"2018-02-01 * MEM", "2018-02-01 * MIL", "2018-02-01 * MIN",
"2018-02-01 * OKC", "2018-02-01 * SAS", "2018-02-01 * TOR",
"2018-02-01 * WAS"), class = "factor")), .Names = c("Name",
"Pos", "Date", "Tm", "Opp", "MP", "Player.ID", "Game.ID"), class = "data.frame", row.names = c(NA,
109L))
I would like to write a function that, for each observation:
> df[1, ]
Name Pos Date Tm Opp MP Player.ID Game.ID
1 Alex Abrines SG 2018-02-01 * OKC DEN 29 abrinal01 2018-02-01 * OKC
creates a subset of all other observations with a matching df$Game.ID.
> df[df$Game.ID == '2018-02-01 * OKC', ]
Name Pos Date Tm Opp MP Player.ID Game.ID
1 Alex Abrines SG 2018-02-01 * OKC DEN 29 abrinal01 2018-02-01 * OKC
2 Steven Adams C 2018-02-01 * OKC DEN 32 adamsst01 2018-02-01 * OKC
8 Carmelo Anthony PF 2018-02-01 * OKC DEN 34 anthoca01 2018-02-01 * OKC
34 Raymond Felton PG 2018-02-01 * OKC DEN 14 feltora01 2018-02-01 * OKC
35 Terrance Ferguson SG 2018-02-01 * OKC DEN 4 fergute01 2018-02-01 * OKC
41 Paul George PF 2018-02-01 * OKC DEN 36 georgpa01 2018-02-01 * OKC
46 Jerami Grant PF 2018-02-01 * OKC DEN 25 grantje01 2018-02-01 * OKC
58 Josh Huestis PF 2018-02-01 * OKC DEN 12 huestjo01 2018-02-01 * OKC
86 Patrick Patterson PF 2018-02-01 * OKC DEN 15 pattepa01 2018-02-01 * OKC
106 Russell Westbrook PG 2018-02-01 * OKC DEN 39 westbru01 2018-02-01 * OKC
and then returns the rank of the original observation's df$MP
> df[1, c('MP')]
[1] 29
in the hierarchy of the new subset.
> xx <- data.frame(cbind(sort(df[df$Game.ID == '2018-02-01 * OKC', c('MP')], decreasing = TRUE), rownames(data.table(sort(df[df$Game.ID == '2018-02-01 * OKC', c('MP')], decreasing = TRUE)))))
> xx
X1 X2
1 39 1
2 36 2
3 34 3
4 32 4
5 29 5
6 25 6
7 15 7
8 14 8
9 12 9
10 4 10
> colnames(xx) <- c('MP', 'Depth.Chart')
> yy <- df[df$Game.ID == '2018-02-01 * OKC', ]
> yy
Name Pos Date Tm Opp MP Player.ID
1 Alex Abrines SG 2018-02-01 * OKC DEN 29 abrinal01
2 Steven Adams C 2018-02-01 * OKC DEN 32 adamsst01
8 Carmelo Anthony PF 2018-02-01 * OKC DEN 34 anthoca01
34 Raymond Felton PG 2018-02-01 * OKC DEN 14 feltora01
35 Terrance Ferguson SG 2018-02-01 * OKC DEN 4 fergute01
41 Paul George PF 2018-02-01 * OKC DEN 36 georgpa01
46 Jerami Grant PF 2018-02-01 * OKC DEN 25 grantje01
58 Josh Huestis PF 2018-02-01 * OKC DEN 12 huestjo01
86 Patrick Patterson PF 2018-02-01 * OKC DEN 15 pattepa01
106 Russell Westbrook PG 2018-02-01 * OKC DEN 39 westbru01
Game.ID
1 2018-02-01 * OKC
2 2018-02-01 * OKC
8 2018-02-01 * OKC
34 2018-02-01 * OKC
35 2018-02-01 * OKC
41 2018-02-01 * OKC
46 2018-02-01 * OKC
58 2018-02-01 * OKC
86 2018-02-01 * OKC
106 2018-02-01 * OKC
> zz <- merge(yy, xx, all.x = TRUE)
> zz
MP Name Pos Date Tm Opp Player.ID
1 4 Terrance Ferguson SG 2018-02-01 * OKC DEN fergute01
2 12 Josh Huestis PF 2018-02-01 * OKC DEN huestjo01
3 14 Raymond Felton PG 2018-02-01 * OKC DEN feltora01
4 15 Patrick Patterson PF 2018-02-01 * OKC DEN pattepa01
5 25 Jerami Grant PF 2018-02-01 * OKC DEN grantje01
6 29 Alex Abrines SG 2018-02-01 * OKC DEN abrinal01
7 32 Steven Adams C 2018-02-01 * OKC DEN adamsst01
8 34 Carmelo Anthony PF 2018-02-01 * OKC DEN anthoca01
9 36 Paul George PF 2018-02-01 * OKC DEN georgpa01
10 39 Russell Westbrook PG 2018-02-01 * OKC DEN westbru01
Game.ID Depth.Chart
1 2018-02-01 * OKC 10
2 2018-02-01 * OKC 9
3 2018-02-01 * OKC 8
4 2018-02-01 * OKC 7
5 2018-02-01 * OKC 6
6 2018-02-01 * OKC 5
7 2018-02-01 * OKC 4
8 2018-02-01 * OKC 3
9 2018-02-01 * OKC 2
10 2018-02-01 * OKC 1
Finally, I need to extract the value of zz$Depth.Chart that corresponds to the original observation, 5.
> zz[zz$MP == 29, c('Depth.Chart')]
[1] 5
Levels: 1 10 2 3 4 5 6 7 8 9
I would like to define a function that executes the laborious and messy steps above for each observation in a data frame and returns a vector of the results. How can I reference the value of df$MP that corresponds to the observation I'm working on without explicitly calling it 29, like I do above? Here are a few of the things I've tried, unsuccessfully.
> f1 <- function(col1, df, col2){
+ lapply(col1, function(i){
+ df2 <- df[col1 == i, col2]
+ df3 <- data.frame(cbind(sort(df2, decreasing = TRUE), rownames(data.table(sort(df2, decreasing = TRUE)))))
+ df3[i, 2]
+ })}
> f1(df$Game.ID, df, c('MP'))[1:10]
[[1]]
[1] 7
Levels: 1 10 2 3 4 5 6 7 8 9
[[2]]
[1] 7
Levels: 1 10 2 3 4 5 6 7 8 9
[[3]]
[1] 6
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[4]]
[1] 8
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[5]]
[1] 8
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[6]]
[1] 3
Levels: 1 2 3 4 5 6 7 8
[[7]]
[1] 5
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[8]]
[1] 7
Levels: 1 10 2 3 4 5 6 7 8 9
[[9]]
[1] 9
Levels: 1 10 11 2 3 4 5 6 7 8 9
[[10]]
[1] 1
Levels: 1 10 2 3 4 5 6 7 8 9
> f1 <- function(col1, df, col2){
+ lapply(col1, function(i){
+ df2 <- df[col1 == i, col2]
+ df3 <- data.frame(cbind(sort(df2, decreasing = TRUE), rownames(data.table(sort(df2, decreasing = TRUE)))))
+ df3[df3$X1 == i, 2]
+ })}
> f1(df$Game.ID, df, c('MP'))
Hide Traceback
Rerun with Debug
Error in Ops.factor(df3$X1, i) : level sets of factors are different
7.
stop("level sets of factors are different")
6.
Ops.factor(df3$X1, i)
5.
`[.data.frame`(df3, df3$X1 == i, 2)
4.
df3[df3$X1 == i, 2]
3.
FUN(X[[i]], ...)
2.
lapply(col1, function(i) {
df2 <- df[col1 == i, col2]
df3 <- data.frame(cbind(sort(df2, decreasing = TRUE), rownames(data.table(sort(df2,
decreasing = TRUE))))) ...
1.
f1(df$Game.ID, df, c("MP"))
> f1 <- function(col1, df, col2){
+ lapply(col1, function(i){
+ df2 <- df[col1 == i, col2]
+ df3 <- data.frame(cbind(sort(df2, decreasing = TRUE), rownames(data.table(sort(df2, decreasing = TRUE)))))
+ df3[col2 == i, 2]
+ })}
> f1(df$Game.ID, df, c('MP'))[1:10]
[[1]]
factor(0)
Levels: 1 10 2 3 4 5 6 7 8 9
[[2]]
factor(0)
Levels: 1 10 2 3 4 5 6 7 8 9
[[3]]
factor(0)
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[4]]
factor(0)
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[5]]
factor(0)
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[6]]
factor(0)
Levels: 1 2 3 4 5 6 7 8
[[7]]
factor(0)
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[8]]
factor(0)
Levels: 1 10 2 3 4 5 6 7 8 9
[[9]]
factor(0)
Levels: 1 10 11 2 3 4 5 6 7 8 9
[[10]]
factor(0)
Levels: 1 10 2 3 4 5 6 7 8 9
I guess I don't fully understand how R treats this i variable inside the function; or, therefore, how to reference it appropriately. In looking through this forum, I found generic examples of nesting functions inside of functions in Python but not in R. Any help would be much appreciated.
EDIT
Here is a simpler subset of my data:
> dput(df)
structure(list(MP = c(29L, 32L, 3L, 34L, 14L, 3L, 40L, 17L, 13L,
14L, 4L, 36L, 6L, 33L, 25L, 12L, 17L, 3L, 15L, 28L, 33L, 39L,
30L), Player.ID = structure(c(1L, 2L, 3L, 8L, 14L, 16L, 21L,
26L, 30L, 34L, 35L, 42L, 41L, 43L, 46L, 58L, 62L, 79L, 86L, 100L,
102L, 106L, 107L), .Label = c("abrinal01", "adamsst01", "aldrico01",
"aldrila01", "anderky01", "anderry01", "antetgi01", "anthoca01",
"anunoog01", "arthuda01", "bartowi01", "bealbr01", "bertada01",
"bjeline01", "brogdma01", "brookaa01", "brookdi01", "brownlo01",
"brownst02", "bullore01", "butleji01", "buyckdw01", "capelca01",
"chandwi01", "craigto01", "crawfja01", "davisde01", "dellama01",
"derozde01", "dienggo01", "drumman01", "ennisja01", "farieke01",
"feltora01", "fergute01", "forbebr01", "fraziti01", "gallola01",
"gasolma01", "gasolpa01", "georgma01", "georgpa01", "gibsota01",
"ginobma01", "gortama01", "grantje01", "greenda02", "greenge01",
"greenja01", "griffbl01", "hardeja01", "harrian01", "harriga01",
"henrymy01", "hensojo01", "hilarne01", "hillida01", "huestjo01",
"ibakase01", "johnsst04", "jokicni01", "jonesty01", "kennalu01",
"kilpase01", "lauvejo01", "lowryky01", "lylestr01", "mahinia01",
"makerth01", "martija01", "mbahalu01", "mclembe01", "meeksjo01",
"middlkh01", "millspa02", "moreler01", "morrima02", "mudiaem01",
"muhamsh01", "munfoxa02", "murrade01", "murraja01", "noguelu01",
"oubreke01", "parketo01", "pattepa01", "paulbr01", "paulch01",
"plumlma02", "poeltja01", "porteot01", "powelno01", "reedwi02",
"satorto01", "scottmi01", "seldewa01", "siakapa01", "smithis01",
"snellto01", "teaguje01", "tollian01", "townska01", "tuckepj01",
"valanjo01", "vaughra01", "westbru01", "wiggian01", "wilsodj01",
"wrighde01"), class = "factor"), Game.ID = structure(c(7L, 7L,
6L, 7L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 6L, 6L, 7L, 7L, 6L, 6L,
7L, 6L, 6L, 7L, 6L), .Label = c("2018-02-01 * DEN", "2018-02-01 * DET",
"2018-02-01 * HOU", "2018-02-01 * MEM", "2018-02-01 * MIL", "2018-02-01 * MIN",
"2018-02-01 * OKC", "2018-02-01 * SAS", "2018-02-01 * TOR", "2018-02-01 * WAS"
), class = "factor")), .Names = c("MP", "Player.ID", "Game.ID"
), row.names = c(1L, 2L, 3L, 8L, 14L, 16L, 21L, 26L, 30L, 34L,
35L, 41L, 42L, 43L, 46L, 58L, 62L, 79L, 86L, 100L, 102L, 106L,
107L), class = "data.frame")
You're using data.table for little steps in your process, but you should just use it for the whole thing. It's very convenient for doing operations "by group", in this case using rank() by Game.ID. Using your small sample data:
library(data.table)
# Convert df to a data.table in place so the grouped `:=` syntax is available.
setDT(df)
# Within each game, rank players by minutes played, most minutes first.
# Tied players share the average of their ranks (rank()'s default, spelled
# out here for clarity).
df[, Depth.Chart := rank(-MP, ties.method = "average"), by = "Game.ID"]
# Print the table with the new Depth.Chart column.
df
# MP Player.ID Game.ID Depth.Chart
# 1: 29 abrinal01 2018-02-01 * OKC 5.0
# 2: 32 adamsst01 2018-02-01 * OKC 4.0
# 3: 3 aldrico01 2018-02-01 * MIN 12.0
# 4: 34 anthoca01 2018-02-01 * OKC 3.0
# 5: 14 bjeline01 2018-02-01 * MIN 8.0
# 6: 3 brookaa01 2018-02-01 * MIN 12.0
# 7: 40 butleji01 2018-02-01 * MIN 1.0
# 8: 17 crawfja01 2018-02-01 * MIN 6.5
# 9: 13 dienggo01 2018-02-01 * MIN 9.0
# 10: 14 feltora01 2018-02-01 * OKC 8.0
# 11: 4 fergute01 2018-02-01 * OKC 10.0
# 12: 36 georgpa01 2018-02-01 * OKC 2.0
# 13: 6 georgma01 2018-02-01 * MIN 10.0
# 14: 33 gibsota01 2018-02-01 * MIN 2.5
# 15: 25 grantje01 2018-02-01 * OKC 6.0
# 16: 12 huestjo01 2018-02-01 * OKC 9.0
# 17: 17 jonesty01 2018-02-01 * MIN 6.5
# 18: 3 muhamsh01 2018-02-01 * MIN 12.0
# 19: 15 pattepa01 2018-02-01 * OKC 7.0
# 20: 28 teaguje01 2018-02-01 * MIN 5.0
# 21: 33 townska01 2018-02-01 * MIN 2.5
# 22: 39 westbru01 2018-02-01 * OKC 1.0
# 23: 30 wiggian01 2018-02-01 * MIN 4.0
# MP Player.ID Game.ID Depth.Chart
rank, by default, averages ties, but see ?rank for other options.
I've run a few experiments, and each experiment led to the appearance of a color.
As I can't do more experiments, I want to draw samples of size 30 and see what frequency table (of colors) I would obtain over 1000 samples. The resulting frequency table should be the sum of the 1000 individual frequency tables.
I thought about concatenating the tables as follows and then aggregating them, but it did not work:
mydata=structure(list(Date = structure(c(11L, 1L, 9L, 9L, 10L, 1L, 2L,
3L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 7L, 4L, 4L, 4L, 6L, 6L, 11L,
5L, 4L, 7L, 10L, 6L, 6L, 2L, 5L, 7L, 11L, 1L, 9L, 11L, 11L, 11L,
1L, 1L), .Label = c("01/02/2016", "02/02/2016", "03/02/2016",
"08/02/2016", "10/02/2016", "11/02/2016", "16/02/2016", "22/02/2016",
"26/01/2016", "27/01/2016", "28/01/2016"), class = "factor"),
Color = structure(c(30L, 33L, 11L, 1L, 18L, 18L, 11L,
16L, 19L, 19L, 22L, 1L, 18L, 18L, 13L, 14L, 13L, 18L, 24L,
24L, 11L, 24L, 2L, 33L, 25L, 1L, 30L, 5L, 24L, 18L, 13L,
35L, 19L, 19L, 18L, 23L, 19L, 8L, 19L, 14L), .Label = c("ARD",
"ARP", "BBB", "BIE", "CFX", "CHR", "DDD", "DOO", "EAU", "ELY",
"EPI", "ETR", "GEN", "GER", "GGG", "GIS", "ISE", "JUV", "LER",
"LES", "LON", "LYR", "MON", "NER", "NGY", "NOJ", "NYO", "ORI",
"PEO", "RAY", "RRR", "RSI", "SEI", "SEP", "VIL", "XQU", "YYY",
"ZYZ"), class = "factor"), Categorie = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1", "1,2", "1,2,3",
"1,3", "2", "2,3", "3", "4", "5"), class = "factor"), Portion_Longueur = c(3L,
4L, 1L, 1L, 2L, 4L, 5L, 6L, 7L, 7L, 8L, 8L, 9L, 8L, 8L, 9L,
11L, 7L, 7L, 7L, 9L, 8L, 3L, 8L, 7L, 11L, 2L, 9L, 8L, 5L,
8L, 12L, 3L, 4L, 1L, 3L, 3L, 3L, 4L, 5L)), .Names = c("Date",
"Color", "Categorie", "Portion_Longueur"), row.names = c(NA,
40L), class = "data.frame")
# Repeat the sampling 1000 times, collecting each frequency table in w.
for (i in 1:1000) {
# Draw 30 values from the Color column (without replacement by default).
mysamp= sample(mydata$Color,size=30)
# Frequency table of the draw, as a data frame with columns mysamp and Freq.
x=data.frame(table(mysamp))
# NOTE: c() on data frames returns a plain list whose names repeat
# (mysamp, Freq, mysamp, Freq, ...); it does not stack rows.
if (i==1) w=x
else w <- c(w, x)
}
# NOTE: with repeated names, w$Freq and w$mysamp pick only the first matching
# element, so this aggregates the first sample alone rather than all 1000.
aggregate(w$Freq, by=list(Color=w$mysamp), FUN=sum)
For example, with 3 samples, i.e. for (i in 1:3), I expect to obtain the sums as follows:
But I do not get the sums; instead I get:
Color x
1 ARD 2
2 ARP 1
3 BBB 0
4 BIE 0
5 CFX 0
6 CHR 0
7 DDD 0
8 DOO 1
9 EAU 0
10 ELY 0
11 EPI 3
12 ETR 0
13 GEN 2
14 GER 2
15 GGG 0
16 GIS 1
17 ISE 0
18 JUV 4
19 LER 5
20 LES 0
21 LON 0
22 LYR 1
23 MON 1
24 NER 2
25 NGY 1
26 NOJ 0
27 NYO 0
28 ORI 0
29 PEO 0
30 RAY 1
31 RRR 0
32 RSI 0
33 SEI 2
34 SEP 0
35 VIL 1
36 XQU 0
37 YYY 0
38 ZYZ 0
How can I do this?
Thanks a lot
Your for loop is what's causing your issues. You end up creating a big list that is somewhat difficult to perform calculations on (check out names(w) to see what I mean). A better data structure would allow for easier calculations:
# Accumulator for the stacked per-sample frequency tables. rbind(NULL, df)
# returns df, so NULL is a valid starting value.
x <- NULL
for (i in seq_len(1000)) {
  # Draw 30 colors without replacement (sample()'s default).
  mysamp <- sample(mydata$Color, size = 30)
  # Tabulate the draw. Color is a factor, so every level is listed and
  # colors that were not drawn get Freq 0. Columns are mysamp and Freq.
  mysamp <- data.frame(table(mysamp))
  # Append this sample's table to the rows collected so far.
  x <- rbind(x, mysamp)
}
# Sum the frequencies per color across all 1000 samples.
aggregate(Freq ~ mysamp, data = x, FUN = sum)
Note that this loop runs a bit slower than your loop. This is because of the rbind() function. See this post. Maybe someone will come along with a more efficient solution.