Related
I am trying to write a code that finds the 3 consecutives months that are the coldest.
For now I have written a code for the 3 first months (1,2,3) but then it should be applied to (4,5,6), (7,8,9), (10,11,12), (2,3,4), (5,6,7), (8,9,10), (11,12,1), (3,4,5), (6,7,8), (9,10,11) and (12,1,2) which are all the possible combinations of 3 consecutives months.
The code I wrote is here :
cold <- data_example %>%
group_by(Site) %>%
filter(Month %in% c(1,2,3)) %>%
mutate(mean_temperature = mean(t_q)) %>%
dplyr::select(-c(t_q,Month)) %>%
distinct(Site, mean_temperature)
average_temp_month_1_2_3 <- cold$mean_temperature
Then I replaced the c(1,2,3) by all possiblities, I have created a new column for each output.
I end up with a dataset with row corresponding to Site and columns are all the possibilities of 3 consecutive months.
After I took the min value for each row using the function apply() and min() and it gives me the coldest quarter for each Site.
I am looking for a way to generalize it, like creating a loop on the possiblities.
The structure of data_example is as follow :
structure(list(Site = c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L, 18L,
18L, 18L, 18L, 18L, 18L, 18L, 25L, 25L, 25L, 25L, 25L, 25L, 25L,
25L, 25L, 25L, 25L, 25L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L,
26L, 26L, 26L, 26L), Month = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L), t_q = c(9.67754848470332, -6.74555496540183,
5.67969761460384, 12.537207581471, -9.4899105618945, 21.0747672424502,
15.2643039243614, -3.62839910494421, 11.3919028351455, 1.69988257436554,
4.22015024307287, 11.7045830784212, 8.91437673833493, 0.579081429509138,
-10.8207481229903, 7.05356868592628, 13.0911580912516, 17.2032089167605,
-2.47642708849114, -11.2105599344486, 33.986736305027, 17.8578689773214,
-14.9114468266335, 14.4681380389141, 0.568074240873411, 7.65458408777801,
1.91368344556659, 6.01571556896127, 11.4858297513536, 2.2608458985328,
-2.08200762781776, 12.1540989284163, 20.9941815285413, 0.375777604316208,
-2.7137027317614, -6.17690210400591, 11.2549857164403, 17.447156776654,
-6.96565197389579, -5.41542361226991, 11.1680111873065, 16.2266522778922,
-11.4503938582433, 5.93300314835716, -18.2818398656237, 16.2930210946949,
9.80219192652316, -0.48237356523527, 7.72680942503686, 5.84113084181759,
9.66129413490096, -4.53018262186904, 7.42187509892118, 9.2559478576895,
8.25120948667013, 8.18182063263247, 16.3703081943971, 19.5469951420341,
3.71888263185773, -0.150179891749435, 1.32057298670562, -5.63556532224354,
21.3918542474341, 4.58752188336035, 5.49430262894033, 5.99587512047837,
-3.76459024109216, -8.53522098071824, 8.01805680562232, 26.2227490426066,
8.90822434139878, 5.04259034084471, 6.89740304247746, 11.9484584922927,
-11.5085102739471, 30.4526759119379, 21.878533782357, -5.39936677076962,
-9.83965056853816, 19.3083455159472, 7.90653548036154, 3.11876660277767,
-8.85027083180008, -9.9225496831988, 5.97307112581907, -2.83528336599284,
-2.75758002814396, 4.68388181004449, 6.61649031537118, -6.65988084338133,
-0.981075313384259, 5.84898952305179, -5.20962191660178, 0.416662319713158,
-10.5336993269853, 19.5350642296553, 26.9696625385792, 15.3291059661081,
15.0799591208354, 13.2310653499033, 7.2053382722482, -7.87288386491102,
20.8083797469715, 6.16664220270041, 8.3360949793043, -14.4000921795463,
-10.5503025782944, 14.3185205291177, 5.83802399796341, 2.49660818997943,
15.7399297014092, -0.834086173817971, 12.4883230222372, 6.73548467376379,
7.7988835803825, -5.13583355913738, 7.51054162811707, 11.6610602814336,
-11.8864185954223, 4.2704440943851)), row.names = c(NA, -120L
), groups = structure(list(Site = c(4L, 5L, 13L, 14L, 15L, 16L,
17L, 18L, 25L, 26L), .rows = structure(list(1:12, 13:24, 25:36,
37:48, 49:60, 61:72, 73:84, 85:96, 97:108, 109:120), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -10L), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
You can use raster::movingFun to do a moving average with circular data, then use slice_min to get the minimum value per group.
library(dplyr)
circ <- function(x, by) ifelse(x%%by == 0, by, x%%by)
df %>%
group_by(Site) %>%
mutate(rolmean = raster::movingFun(t_q, n = 3, fun = mean, circular = TRUE)) %>%
slice_min(rolmean) %>%
mutate(coldest = toString(circ(c(Month-1, Month, Month+1), 12)))
output
# A tibble: 10 × 5
# Groups: Site [10]
Site Month t_q rolmean coldest
<int> <int> <dbl> <dbl> <chr>
1 4 2 -6.75 2.87 1, 2, 3
2 5 3 -10.8 -1.06 2, 3, 4
3 13 11 -2.71 -2.84 10, 11, 12
4 14 8 5.93 -7.93 7, 8, 9
5 15 3 9.66 3.66 2, 3, 4
6 16 7 -3.76 -2.10 6, 7, 8
7 17 11 -8.85 -5.22 10, 11, 12
8 18 10 0.417 -5.11 9, 10, 11
9 25 10 -14.4 -5.54 9, 10, 11
10 26 12 4.27 -0.593 11, 12, 1
Using which.min in aggregate on a moving average window.
aggregate(t_q ~ Site, dat, \(s) {
win <- 3 ## window length
sq <- Map(seq, 1:(length(s) - win + 1), win:length(s))
toString(sq[[which.min(sapply(sq, \(sq) mean(s[sq])))]])
})
# Site t_q
# 1 4 1, 2, 3
# 2 5 2, 3, 4
# 3 13 10, 11, 12
# 4 14 7, 8, 9
# 5 15 2, 3, 4
# 6 16 6, 7, 8
# 7 17 10, 11, 12
# 8 18 9, 10, 11
# 9 25 9, 10, 11
# 10 26 10, 11, 12
I have the following data frame with ages binned in ranges of 5 years and the frequency of a condition happening in males/females. The problem is that there were no occurrences in either gender for example in the range 15-20.
structure(list(age = structure(c(1L, 2L, 3L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L), .Label = c("[0,5]",
"(5,10]", "(10,15]", "(15,20]", "(20,25]", "(25,30]", "(30,35]",
"(35,40]", "(40,45]", "(45,50]", "(50,55]", "(55,60]", "(60,65]",
"(65,70]", "(70,75]", "(75,80]", "(80,85]", "(85,90]", "(90,95]",
"(95,100]"), class = "factor"), male = c(2L, 1L, 1L, 4L, 8L,
9L, 20L, 33L, 49L, 104L, 112L, 176L, 159L, 140L, 94L, 72L, 32L,
6L, 2L), female = c(1L, 1L, NA, 7L, 7L, 4L, 23L, 39L, 44L, 74L,
94L, 111L, 124L, 129L, 110L, 92L, 76L, 30L, 7L)), row.names = c(NA,
-19L), groups = structure(list(age = structure(c(1L, 2L, 3L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L, 20L), .Label = c("[0,5]", "(5,10]", "(10,15]", "(15,20]",
"(20,25]", "(25,30]", "(30,35]", "(35,40]", "(40,45]", "(45,50]",
"(50,55]", "(55,60]", "(60,65]", "(65,70]", "(70,75]", "(75,80]",
"(80,85]", "(85,90]", "(90,95]", "(95,100]"), class = "factor"),
.rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -19L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
If I check the levels it properly shows all levels.
what I would want is a data frame where all ranges of ages show up and when they don't exist substitute then by 0.
You can use complete :
library(dplyr)
library(tidyr)
df %>%
ungroup %>%
complete(age, fill = list(male = 0, female = 0))
# age male female
# <fct> <dbl> <dbl>
# 1 [0,5] 2 1
# 2 (5,10] 1 1
# 3 (10,15] 1 0
# 4 (15,20] 0 0
# 5 (20,25] 4 7
# 6 (25,30] 8 7
# 7 (30,35] 9 4
# 8 (35,40] 20 23
# 9 (40,45] 33 39
#10 (45,50] 49 44
#11 (50,55] 104 74
#12 (55,60] 112 94
#13 (60,65] 176 111
#14 (65,70] 159 124
#15 (70,75] 140 129
#16 (75,80] 94 110
#17 (80,85] 72 92
#18 (85,90] 32 76
#19 (90,95] 6 30
#20 (95,100] 2 7
THis is the first time i am working on time series, hence kindly pardon me for anything wrong in my approach.
I have monthly sales data for multiple Groups. THe data is for 3 years, and i would like to implement time series analysis for the same. I am not sure if 3 years data is actually good enough, but however i would like to understand it better.
I currently understand that the time series is decomposed into three parts- Trend, Seasonality and Random.
I want to split my Sales data for each Group, into the Trend, Seasonality and Random part. Since trend and seasonality are gone, hence i want to use only random to understand the Sales metrics better.
Since data is monthly, hence i need to use multiplicative.
Should i use STL or decompose?
I have the basic Decompose code, however not sure how to incorporate the same for multiple groups, to identify the trend, seasonality and random for each group.
I am not referring to ARIMA model. I am basically referring to the standard time series approach.
Below is how my data looks like.
Group Date Month Sales
Group1 Jan-15 1 75030
Group1 Feb-15 2 16073
Group1 Mar-15 3 17161
Group1 Apr-15 4 94946
Group1 May-15 5 62999
Group1 Jun-15 6 4698
Group1 Jul-15 7 76743
Group1 Aug-15 8 28800
Group1 Sep-15 9 12225
Group1 Oct-15 10 71793
Group1 Nov-15 11 26686
Group1 Dec-15 12 6252
Group1 Jan-16 13 82698
Group1 Feb-16 14 71201
Group1 Mar-16 15 65798
Group1 Apr-16 16 4407
Group1 May-16 17 7491
Group1 Jun-16 18 24366
Group1 Jul-16 19 99616
Group1 Aug-16 20 74443
Group1 Sep-16 21 54122
Group1 Oct-16 22 20762
Group1 Nov-16 23 91376
Group1 Dec-16 24 18693
Group1 Jan-17 25 30395
Group1 Feb-17 26 82049
Group1 Mar-17 27 79701
Group1 Apr-17 28 38862
Group1 May-17 29 84802
Group1 Jun-17 30 81715
Group1 Jul-17 31 60786
Group1 Aug-17 32 88731
Group1 Sep-17 33 28502
Group1 Oct-17 34 79245
Group1 Nov-17 35 15553
Group1 Dec-17 36 3237
Group2 Jan-15 1 8990
Group2 Feb-15 2 47516
Group2 Mar-15 3 15076
Group2 Apr-15 4 60888
Group2 May-15 5 47111
Group2 Jun-15 6 7770
Group2 Jul-15 7 25080
Group2 Aug-15 8 46586
Group2 Sep-15 9 12595
Group2 Oct-15 10 71883
Group2 Nov-15 11 21634
Group2 Dec-15 12 78799
Group2 Jan-16 13 57596
Group2 Feb-16 14 35685
Group2 Mar-16 15 68518
Group2 Apr-16 16 35661
Group2 May-16 17 65294
Group2 Jun-16 18 62602
Group2 Jul-16 19 13506
Group2 Aug-16 20 49215
Group2 Sep-16 21 32008
Group2 Oct-16 22 27924
Group2 Nov-16 23 56146
Group2 Dec-16 24 23975
Group2 Jan-17 25 18686
Group2 Feb-17 26 77076
Group2 Mar-17 27 63992
Group2 Apr-17 28 38087
Group2 May-17 29 19846
Group2 Jun-17 30 46823
Group2 Jul-17 31 11035
Group2 Aug-17 32 73686
Group2 Sep-17 33 35523
Group2 Oct-17 34 97417
Group2 Nov-17 35 27954
Group2 Dec-17 36 79004
Below is my code.
x <- ts(df, start = c(2015, 1), end = c(2017, 12), frequency = 12)
m <- decompose(x)
Please correct me if there is something wrong in my approach, since I am new to time series modelling.
Thanks,
Jay
The first column is a factor hence you can use tapply function to extract time series by Group. The results will be stored in list. Than you can use lapply with agruments: list of time series and function decompose.
To access the results of decomposition you can index the list, e.g. dcs[[1]] will extract decompostion for Group 1.
Data:
df <- structure(list(Group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("Group1", "Group2"), class = "factor"), Date = structure(c(13L,
10L, 22L, 1L, 25L, 19L, 16L, 4L, 34L, 31L, 28L, 7L, 14L, 11L,
23L, 2L, 26L, 20L, 17L, 5L, 35L, 32L, 29L, 8L, 15L, 12L, 24L,
3L, 27L, 21L, 18L, 6L, 36L, 33L, 30L, 9L, 13L, 10L, 22L, 1L,
25L, 19L, 16L, 4L, 34L, 31L, 28L, 7L, 14L, 11L, 23L, 2L, 26L,
20L, 17L, 5L, 35L, 32L, 29L, 8L, 15L, 12L, 24L, 3L, 27L, 21L,
18L, 6L, 36L, 33L, 30L, 9L), .Label = c("Apr-15", "Apr-16", "Apr-17",
"Aug-15", "Aug-16", "Aug-17", "Dec-15", "Dec-16", "Dec-17", "Feb-15",
"Feb-16", "Feb-17", "Jan-15", "Jan-16", "Jan-17", "Jul-15", "Jul-16",
"Jul-17", "Jun-15", "Jun-16", "Jun-17", "Mar-15", "Mar-16", "Mar-17",
"May-15", "May-16", "May-17", "Nov-15", "Nov-16", "Nov-17", "Oct-15",
"Oct-16", "Oct-17", "Sep-15", "Sep-16", "Sep-17"), class = "factor"),
Month = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L,
13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L,
25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L), Sales = c(75030L,
16073L, 17161L, 94946L, 62999L, 4698L, 76743L, 28800L, 12225L,
71793L, 26686L, 6252L, 82698L, 71201L, 65798L, 4407L, 7491L,
24366L, 99616L, 74443L, 54122L, 20762L, 91376L, 18693L, 30395L,
82049L, 79701L, 38862L, 84802L, 81715L, 60786L, 88731L, 28502L,
79245L, 15553L, 3237L, 8990L, 47516L, 15076L, 60888L, 47111L,
7770L, 25080L, 46586L, 12595L, 71883L, 21634L, 78799L, 57596L,
35685L, 68518L, 35661L, 65294L, 62602L, 13506L, 49215L, 32008L,
27924L, 56146L, 23975L, 18686L, 77076L, 63992L, 38087L, 19846L,
46823L, 11035L, 73686L, 35523L, 97417L, 27954L, 79004L)), class = "data.frame", row.names = c(NA,
-72L))
Code:
tss <- tapply(df$Sales, df$Group, ts, start = c(2015, 1), frequency = 12)
dcs <- lapply(tss, decompose)
I am relatively new to R; and, I need help with a user defined function. I would like to see where each observation of a data frame ranks in a subset of similar observations of the same data frame. I'm having trouble referencing the original observation, in order to extract its rank, within my function.
Here is a sample of my data:
> dput(df)
structure(list(Name = c("Alex Abrines", "Steven Adams", "Cole Aldrich",
"LaMarcus Aldridge", "Kyle Anderson", "Ryan Anderson", "Giannis Antetokounmpo",
"Carmelo Anthony", "OG Anunoby", "Darrell Arthur", "Will Barton",
"Bradley Beal", "Davis Bertans", "Nemanja Bjelica", "Malcolm Brogdon",
"Aaron Brooks", "Dillon Brooks", "Lorenzo Brown", "Sterling Brown",
"Reggie Bullock", "Jimmy Butler", "Dwight Buycks", "Clint Capela",
"Wilson Chandler", "Torrey Craig", "Jamal Crawford", "Deyonta Davis",
"Matthew Dellavedova", "DeMar DeRozan", "Gorgui Dieng", "Andre Drummond",
"James Ennis", "Kenneth Faried", "Raymond Felton", "Terrance Ferguson",
"Bryn Forbes", "Tim Frazier", "Langston Galloway", "Marc Gasol",
"Pau Gasol", "Paul George", "Marcus Georges-Hunt", "Taj Gibson",
"Manu Ginobili", "Marcin Gortat", "Jerami Grant", "Danny Green",
"Gerald Green", "JaMychal Green", "Blake Griffin", "James Harden",
"Gary Harris", "Andrew Harrison", "Myke Henry", "John Henson",
"Nene Hilario", "Darrun Hilliard", "Josh Huestis", "Serge Ibaka",
"Stanley Johnson", "Nikola Jokic", "Tyus Jones", "Luke Kennard",
"Sean Kilpatrick", "Joffrey Lauvergne", "Kyle Lowry", "Trey Lyles",
"Ian Mahinmi", "Thon Maker", "Jarell Martin", "Luc Mbah a Moute",
"Ben McLemore", "Jodie Meeks", "Khris Middleton", "Patty Mills",
"Eric Moreland", "Markieff Morris", "Emmanuel Mudiay", "Shabazz Muhammad",
"Xavier Munford", "Dejounte Murray", "Jamal Murray", "Lucas Nogueira",
"Kelly Oubre", "Tony Parker", "Patrick Patterson", "Brandon Paul",
"Chris Paul", "Marshall Plumlee", "Jakob Poeltl", "Otto Porter",
"Norman Powell", "Willie Reed", "Tomas Satoransky", "Mike Scott",
"Wayne Selden", "Pascal Siakam", "Ish Smith", "Tony Snell", "Jeff Teague",
"Anthony Tolliver", "Karl-Anthony Towns", "P.J. Tucker", "Jonas Valanciunas",
"Rashad Vaughn", "Russell Westbrook", "Andrew Wiggins", "D.J. Wilson",
"Delon Wright"), Pos = structure(c(5L, 1L, 1L, 1L, 3L, 2L, 3L,
2L, 2L, 2L, 4L, 4L, 2L, 2L, 4L, 4L, 5L, 4L, 4L, 5L, 3L, 4L, 1L,
2L, 5L, 4L, 1L, 4L, 5L, 1L, 1L, 2L, 2L, 4L, 5L, 4L, 4L, 4L, 1L,
1L, 2L, 4L, 2L, 4L, 1L, 2L, 5L, 5L, 2L, 2L, 4L, 4L, 4L, 2L, 1L,
1L, 4L, 2L, 1L, 2L, 1L, 4L, 4L, 4L, 1L, 4L, 2L, 1L, 1L, 2L, 2L,
4L, 4L, 3L, 4L, 1L, 2L, 4L, 3L, 4L, 4L, 4L, 1L, 2L, 4L, 2L, 4L,
4L, 1L, 1L, 2L, 4L, 1L, 4L, 2L, 5L, 2L, 4L, 5L, 4L, 1L, 1L, 2L,
1L, 4L, 4L, 3L, 2L, 4L), .Label = c("C", "PF", "SF", "PG", "SG"
), class = "factor"), Date = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "2018-02-01 *", class = "factor"),
Tm = structure(c(7L, 7L, 6L, 8L, 8L, 3L, 5L, 7L, 9L, 1L,
1L, 10L, 8L, 6L, 5L, 6L, 4L, 9L, 5L, 2L, 6L, 2L, 3L, 1L,
1L, 6L, 4L, 5L, 9L, 6L, 2L, 4L, 1L, 7L, 7L, 8L, 10L, 2L,
4L, 8L, 7L, 6L, 6L, 8L, 10L, 7L, 8L, 3L, 4L, 2L, 3L, 1L,
4L, 4L, 5L, 3L, 8L, 7L, 9L, 2L, 1L, 6L, 2L, 5L, 8L, 9L, 1L,
10L, 5L, 4L, 3L, 4L, 10L, 5L, 8L, 2L, 10L, 1L, 6L, 5L, 8L,
1L, 9L, 10L, 8L, 7L, 8L, 3L, 5L, 9L, 10L, 9L, 2L, 10L, 10L,
4L, 9L, 2L, 5L, 6L, 2L, 6L, 3L, 9L, 5L, 7L, 6L, 5L, 9L), .Label = c("DEN",
"DET", "HOU", "MEM", "MIL", "MIN", "OKC", "SAS", "TOR", "WAS"
), class = "factor"), Opp = structure(c(1L, 1L, 5L, 3L, 3L,
8L, 6L, 1L, 10L, 7L, 7L, 9L, 3L, 5L, 6L, 5L, 2L, 10L, 6L,
4L, 5L, 4L, 8L, 7L, 7L, 5L, 2L, 6L, 10L, 5L, 4L, 2L, 7L,
1L, 1L, 3L, 9L, 4L, 2L, 3L, 1L, 5L, 5L, 3L, 9L, 1L, 3L, 8L,
2L, 4L, 8L, 7L, 2L, 2L, 6L, 8L, 3L, 1L, 10L, 4L, 7L, 5L,
4L, 6L, 3L, 10L, 7L, 9L, 6L, 2L, 8L, 2L, 9L, 6L, 3L, 4L,
9L, 7L, 5L, 6L, 3L, 7L, 10L, 9L, 3L, 1L, 3L, 8L, 6L, 10L,
9L, 10L, 4L, 9L, 9L, 2L, 10L, 4L, 6L, 5L, 4L, 5L, 8L, 10L,
6L, 1L, 5L, 6L, 10L), .Label = c("DEN", "DET", "HOU", "MEM",
"MIL", "MIN", "OKC", "SAS", "TOR", "WAS"), class = "factor"),
MP = c(29L, 32L, 3L, 34L, 30L, 29L, 36L, 34L, 21L, 1L, 36L,
38L, 13L, 14L, 10L, 3L, 32L, 11L, 24L, 35L, 40L, 19L, 35L,
34L, 22L, 17L, 15L, 25L, 38L, 13L, 28L, 15L, 10L, 14L, 4L,
18L, 17L, 4L, 33L, 20L, 36L, 6L, 33L, 20L, 26L, 25L, 28L,
30L, 20L, 35L, 37L, 38L, 34L, 22L, 32L, 13L, 8L, 12L, 35L,
36L, 37L, 17L, 21L, 18L, 2L, 35L, 15L, 19L, 13L, 28L, 35L,
10L, 9L, 35L, 24L, 5L, 32L, 14L, 3L, 7L, 24L, 34L, 3L, 23L,
17L, 15L, 2L, 30L, 5L, 16L, 29L, 26L, 5L, 28L, 19L, 31L,
13L, 29L, 29L, 28L, 22L, 33L, 31L, 29L, 4L, 39L, 30L, 4L,
13L), Player.ID = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L,
32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 42L, 41L, 43L,
44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 53L, 52L, 54L, 55L,
56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L,
68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L,
80L, 81L, 82L, 83L, 84L, 85L, 86L, 87L, 88L, 89L, 90L, 91L,
92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 100L, 101L, 102L,
103L, 104L, 105L, 106L, 107L, 108L, 109L), .Label = c("abrinal01",
"adamsst01", "aldrico01", "aldrila01", "anderky01", "anderry01",
"antetgi01", "anthoca01", "anunoog01", "arthuda01", "bartowi01",
"bealbr01", "bertada01", "bjeline01", "brogdma01", "brookaa01",
"brookdi01", "brownlo01", "brownst02", "bullore01", "butleji01",
"buyckdw01", "capelca01", "chandwi01", "craigto01", "crawfja01",
"davisde01", "dellama01", "derozde01", "dienggo01", "drumman01",
"ennisja01", "farieke01", "feltora01", "fergute01", "forbebr01",
"fraziti01", "gallola01", "gasolma01", "gasolpa01", "georgma01",
"georgpa01", "gibsota01", "ginobma01", "gortama01", "grantje01",
"greenda02", "greenge01", "greenja01", "griffbl01", "hardeja01",
"harrian01", "harriga01", "henrymy01", "hensojo01", "hilarne01",
"hillida01", "huestjo01", "ibakase01", "johnsst04", "jokicni01",
"jonesty01", "kennalu01", "kilpase01", "lauvejo01", "lowryky01",
"lylestr01", "mahinia01", "makerth01", "martija01", "mbahalu01",
"mclembe01", "meeksjo01", "middlkh01", "millspa02", "moreler01",
"morrima02", "mudiaem01", "muhamsh01", "munfoxa02", "murrade01",
"murraja01", "noguelu01", "oubreke01", "parketo01", "pattepa01",
"paulbr01", "paulch01", "plumlma02", "poeltja01", "porteot01",
"powelno01", "reedwi02", "satorto01", "scottmi01", "seldewa01",
"siakapa01", "smithis01", "snellto01", "teaguje01", "tollian01",
"townska01", "tuckepj01", "valanjo01", "vaughra01", "westbru01",
"wiggian01", "wilsodj01", "wrighde01"), class = "factor"),
Game.ID = structure(c(7L, 7L, 6L, 8L, 8L, 3L, 5L, 7L, 9L,
1L, 1L, 10L, 8L, 6L, 5L, 6L, 4L, 9L, 5L, 2L, 6L, 2L, 3L,
1L, 1L, 6L, 4L, 5L, 9L, 6L, 2L, 4L, 1L, 7L, 7L, 8L, 10L,
2L, 4L, 8L, 7L, 6L, 6L, 8L, 10L, 7L, 8L, 3L, 4L, 2L, 3L,
1L, 4L, 4L, 5L, 3L, 8L, 7L, 9L, 2L, 1L, 6L, 2L, 5L, 8L, 9L,
1L, 10L, 5L, 4L, 3L, 4L, 10L, 5L, 8L, 2L, 10L, 1L, 6L, 5L,
8L, 1L, 9L, 10L, 8L, 7L, 8L, 3L, 5L, 9L, 10L, 9L, 2L, 10L,
10L, 4L, 9L, 2L, 5L, 6L, 2L, 6L, 3L, 9L, 5L, 7L, 6L, 5L,
9L), .Label = c("2018-02-01 * DEN", "2018-02-01 * DET", "2018-02-01 * HOU",
"2018-02-01 * MEM", "2018-02-01 * MIL", "2018-02-01 * MIN",
"2018-02-01 * OKC", "2018-02-01 * SAS", "2018-02-01 * TOR",
"2018-02-01 * WAS"), class = "factor")), .Names = c("Name",
"Pos", "Date", "Tm", "Opp", "MP", "Player.ID", "Game.ID"), class = "data.frame", row.names = c(NA,
109L))
I would like to write a function that, for each observation:
> df[1, ]
Name Pos Date Tm Opp MP Player.ID Game.ID
1 Alex Abrines SG 2018-02-01 * OKC DEN 29 abrinal01 2018-02-01 * OKC
creates a subset of all other observations with a matching df$Game.ID.
> df[df$Game.ID == '2018-02-01 * OKC', ]
Name Pos Date Tm Opp MP Player.ID Game.ID
1 Alex Abrines SG 2018-02-01 * OKC DEN 29 abrinal01 2018-02-01 * OKC
2 Steven Adams C 2018-02-01 * OKC DEN 32 adamsst01 2018-02-01 * OKC
8 Carmelo Anthony PF 2018-02-01 * OKC DEN 34 anthoca01 2018-02-01 * OKC
34 Raymond Felton PG 2018-02-01 * OKC DEN 14 feltora01 2018-02-01 * OKC
35 Terrance Ferguson SG 2018-02-01 * OKC DEN 4 fergute01 2018-02-01 * OKC
41 Paul George PF 2018-02-01 * OKC DEN 36 georgpa01 2018-02-01 * OKC
46 Jerami Grant PF 2018-02-01 * OKC DEN 25 grantje01 2018-02-01 * OKC
58 Josh Huestis PF 2018-02-01 * OKC DEN 12 huestjo01 2018-02-01 * OKC
86 Patrick Patterson PF 2018-02-01 * OKC DEN 15 pattepa01 2018-02-01 * OKC
106 Russell Westbrook PG 2018-02-01 * OKC DEN 39 westbru01 2018-02-01 * OKC
and then returns the rank of the original observation's df$MP
> df[1, c('MP')]
[1] 29
in the hierarchy of the new subset.
> xx <- data.frame(cbind(sort(df[df$Game.ID == '2018-02-01 * OKC', c('MP')], decreasing = TRUE), rownames(data.table(sort(df[df$Game.ID == '2018-02-01 * OKC', c('MP')], decreasing = TRUE)))))
> xx
X1 X2
1 39 1
2 36 2
3 34 3
4 32 4
5 29 5
6 25 6
7 15 7
8 14 8
9 12 9
10 4 10
> colnames(xx) <- c('MP', 'Depth.Chart')
> yy <- df[df$Game.ID == '2018-02-01 * OKC', ]
> yy
Name Pos Date Tm Opp MP Player.ID
1 Alex Abrines SG 2018-02-01 * OKC DEN 29 abrinal01
2 Steven Adams C 2018-02-01 * OKC DEN 32 adamsst01
8 Carmelo Anthony PF 2018-02-01 * OKC DEN 34 anthoca01
34 Raymond Felton PG 2018-02-01 * OKC DEN 14 feltora01
35 Terrance Ferguson SG 2018-02-01 * OKC DEN 4 fergute01
41 Paul George PF 2018-02-01 * OKC DEN 36 georgpa01
46 Jerami Grant PF 2018-02-01 * OKC DEN 25 grantje01
58 Josh Huestis PF 2018-02-01 * OKC DEN 12 huestjo01
86 Patrick Patterson PF 2018-02-01 * OKC DEN 15 pattepa01
106 Russell Westbrook PG 2018-02-01 * OKC DEN 39 westbru01
Game.ID
1 2018-02-01 * OKC
2 2018-02-01 * OKC
8 2018-02-01 * OKC
34 2018-02-01 * OKC
35 2018-02-01 * OKC
41 2018-02-01 * OKC
46 2018-02-01 * OKC
58 2018-02-01 * OKC
86 2018-02-01 * OKC
106 2018-02-01 * OKC
> zz <- merge(yy, xx, all.x = TRUE)
> zz
MP Name Pos Date Tm Opp Player.ID
1 4 Terrance Ferguson SG 2018-02-01 * OKC DEN fergute01
2 12 Josh Huestis PF 2018-02-01 * OKC DEN huestjo01
3 14 Raymond Felton PG 2018-02-01 * OKC DEN feltora01
4 15 Patrick Patterson PF 2018-02-01 * OKC DEN pattepa01
5 25 Jerami Grant PF 2018-02-01 * OKC DEN grantje01
6 29 Alex Abrines SG 2018-02-01 * OKC DEN abrinal01
7 32 Steven Adams C 2018-02-01 * OKC DEN adamsst01
8 34 Carmelo Anthony PF 2018-02-01 * OKC DEN anthoca01
9 36 Paul George PF 2018-02-01 * OKC DEN georgpa01
10 39 Russell Westbrook PG 2018-02-01 * OKC DEN westbru01
Game.ID Depth.Chart
1 2018-02-01 * OKC 10
2 2018-02-01 * OKC 9
3 2018-02-01 * OKC 8
4 2018-02-01 * OKC 7
5 2018-02-01 * OKC 6
6 2018-02-01 * OKC 5
7 2018-02-01 * OKC 4
8 2018-02-01 * OKC 3
9 2018-02-01 * OKC 2
10 2018-02-01 * OKC 1
Finally, I need to extract the value of zz$Depth.Chart that corresponds to the original observation, 5.
> zz[zz$MP == 29, c('Depth.Chart')]
[1] 5
Levels: 1 10 2 3 4 5 6 7 8 9
I would like to define a function that executes the laborious and messy steps above for each observation in a data frame and returns a vector of the results. How can I reference the value of df$MP that corresponds to the observation I'm working on without explicitly calling it 29, like I do above? Here are a few of the thing I've tried, unsuccessfully.
> f1 <- function(col1, df, col2){
+ lapply(col1, function(i){
+ df2 <- df[col1 == i, col2]
+ df3 <- data.frame(cbind(sort(df2, decreasing = TRUE), rownames(data.table(sort(df2, decreasing = TRUE)))))
+ df3[i, 2]
+ })}
> f1(df$Game.ID, df, c('MP'))[1:10]
[[1]]
[1] 7
Levels: 1 10 2 3 4 5 6 7 8 9
[[2]]
[1] 7
Levels: 1 10 2 3 4 5 6 7 8 9
[[3]]
[1] 6
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[4]]
[1] 8
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[5]]
[1] 8
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[6]]
[1] 3
Levels: 1 2 3 4 5 6 7 8
[[7]]
[1] 5
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[8]]
[1] 7
Levels: 1 10 2 3 4 5 6 7 8 9
[[9]]
[1] 9
Levels: 1 10 11 2 3 4 5 6 7 8 9
[[10]]
[1] 1
Levels: 1 10 2 3 4 5 6 7 8 9
> f1 <- function(col1, df, col2){
+ lapply(col1, function(i){
+ df2 <- df[col1 == i, col2]
+ df3 <- data.frame(cbind(sort(df2, decreasing = TRUE), rownames(data.table(sort(df2, decreasing = TRUE)))))
+ df3[df3$X1 == i, 2]
+ })}
> f1(df$Game.ID, df, c('MP'))
Hide Traceback
Rerun with Debug
Error in Ops.factor(df3$X1, i) : level sets of factors are different
7.
stop("level sets of factors are different")
6.
Ops.factor(df3$X1, i)
5.
`[.data.frame`(df3, df3$X1 == i, 2)
4.
df3[df3$X1 == i, 2]
3.
FUN(X[[i]], ...)
2.
lapply(col1, function(i) {
df2 <- df[col1 == i, col2]
df3 <- data.frame(cbind(sort(df2, decreasing = TRUE), rownames(data.table(sort(df2,
decreasing = TRUE))))) ...
1.
f1(df$Game.ID, df, c("MP"))
> f1 <- function(col1, df, col2){
+ lapply(col1, function(i){
+ df2 <- df[col1 == i, col2]
+ df3 <- data.frame(cbind(sort(df2, decreasing = TRUE), rownames(data.table(sort(df2, decreasing = TRUE)))))
+ df3[col2 == i, 2]
+ })}
> f1(df$Game.ID, df, c('MP'))[1:10]
[[1]]
factor(0)
Levels: 1 10 2 3 4 5 6 7 8 9
[[2]]
factor(0)
Levels: 1 10 2 3 4 5 6 7 8 9
[[3]]
factor(0)
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[4]]
factor(0)
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[5]]
factor(0)
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[6]]
factor(0)
Levels: 1 2 3 4 5 6 7 8
[[7]]
factor(0)
Levels: 1 10 11 12 13 2 3 4 5 6 7 8 9
[[8]]
factor(0)
Levels: 1 10 2 3 4 5 6 7 8 9
[[9]]
factor(0)
Levels: 1 10 11 2 3 4 5 6 7 8 9
[[10]]
factor(0)
Levels: 1 10 2 3 4 5 6 7 8 9
I guess I don't fully understand how R treats this i variable inside the function; or, therefore, how reference it appropriately. In looking through this forum, I found generic examples of nesting functions inside of functions in Python but not in R. Any help would be much appreciated.
EDIT
Here is a simpler subset of my data:
> dput(df)
structure(list(MP = c(29L, 32L, 3L, 34L, 14L, 3L, 40L, 17L, 13L,
14L, 4L, 36L, 6L, 33L, 25L, 12L, 17L, 3L, 15L, 28L, 33L, 39L,
30L), Player.ID = structure(c(1L, 2L, 3L, 8L, 14L, 16L, 21L,
26L, 30L, 34L, 35L, 42L, 41L, 43L, 46L, 58L, 62L, 79L, 86L, 100L,
102L, 106L, 107L), .Label = c("abrinal01", "adamsst01", "aldrico01",
"aldrila01", "anderky01", "anderry01", "antetgi01", "anthoca01",
"anunoog01", "arthuda01", "bartowi01", "bealbr01", "bertada01",
"bjeline01", "brogdma01", "brookaa01", "brookdi01", "brownlo01",
"brownst02", "bullore01", "butleji01", "buyckdw01", "capelca01",
"chandwi01", "craigto01", "crawfja01", "davisde01", "dellama01",
"derozde01", "dienggo01", "drumman01", "ennisja01", "farieke01",
"feltora01", "fergute01", "forbebr01", "fraziti01", "gallola01",
"gasolma01", "gasolpa01", "georgma01", "georgpa01", "gibsota01",
"ginobma01", "gortama01", "grantje01", "greenda02", "greenge01",
"greenja01", "griffbl01", "hardeja01", "harrian01", "harriga01",
"henrymy01", "hensojo01", "hilarne01", "hillida01", "huestjo01",
"ibakase01", "johnsst04", "jokicni01", "jonesty01", "kennalu01",
"kilpase01", "lauvejo01", "lowryky01", "lylestr01", "mahinia01",
"makerth01", "martija01", "mbahalu01", "mclembe01", "meeksjo01",
"middlkh01", "millspa02", "moreler01", "morrima02", "mudiaem01",
"muhamsh01", "munfoxa02", "murrade01", "murraja01", "noguelu01",
"oubreke01", "parketo01", "pattepa01", "paulbr01", "paulch01",
"plumlma02", "poeltja01", "porteot01", "powelno01", "reedwi02",
"satorto01", "scottmi01", "seldewa01", "siakapa01", "smithis01",
"snellto01", "teaguje01", "tollian01", "townska01", "tuckepj01",
"valanjo01", "vaughra01", "westbru01", "wiggian01", "wilsodj01",
"wrighde01"), class = "factor"), Game.ID = structure(c(7L, 7L,
6L, 7L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 6L, 6L, 7L, 7L, 6L, 6L,
7L, 6L, 6L, 7L, 6L), .Label = c("2018-02-01 * DEN", "2018-02-01 * DET",
"2018-02-01 * HOU", "2018-02-01 * MEM", "2018-02-01 * MIL", "2018-02-01 * MIN",
"2018-02-01 * OKC", "2018-02-01 * SAS", "2018-02-01 * TOR", "2018-02-01 * WAS"
), class = "factor")), .Names = c("MP", "Player.ID", "Game.ID"
), row.names = c(1L, 2L, 3L, 8L, 14L, 16L, 21L, 26L, 30L, 34L,
35L, 41L, 42L, 43L, 46L, 58L, 62L, 79L, 86L, 100L, 102L, 106L,
107L), class = "data.frame")
You're using data.table for little steps in your process, but you should just use it for the whole thing. It's very convenient for doing operations "by group", in this case using rank() by Game.ID. Using your small sample data:
library(data.table)
setDT(df)
df[, Depth.Chart := rank(-MP), by = Game.ID]
df
# MP Player.ID Game.ID Depth.Chart
# 1: 29 abrinal01 2018-02-01 * OKC 5.0
# 2: 32 adamsst01 2018-02-01 * OKC 4.0
# 3: 3 aldrico01 2018-02-01 * MIN 12.0
# 4: 34 anthoca01 2018-02-01 * OKC 3.0
# 5: 14 bjeline01 2018-02-01 * MIN 8.0
# 6: 3 brookaa01 2018-02-01 * MIN 12.0
# 7: 40 butleji01 2018-02-01 * MIN 1.0
# 8: 17 crawfja01 2018-02-01 * MIN 6.5
# 9: 13 dienggo01 2018-02-01 * MIN 9.0
# 10: 14 feltora01 2018-02-01 * OKC 8.0
# 11: 4 fergute01 2018-02-01 * OKC 10.0
# 12: 36 georgpa01 2018-02-01 * OKC 2.0
# 13: 6 georgma01 2018-02-01 * MIN 10.0
# 14: 33 gibsota01 2018-02-01 * MIN 2.5
# 15: 25 grantje01 2018-02-01 * OKC 6.0
# 16: 12 huestjo01 2018-02-01 * OKC 9.0
# 17: 17 jonesty01 2018-02-01 * MIN 6.5
# 18: 3 muhamsh01 2018-02-01 * MIN 12.0
# 19: 15 pattepa01 2018-02-01 * OKC 7.0
# 20: 28 teaguje01 2018-02-01 * MIN 5.0
# 21: 33 townska01 2018-02-01 * MIN 2.5
# 22: 39 westbru01 2018-02-01 * OKC 1.0
# 23: 30 wiggian01 2018-02-01 * MIN 4.0
# MP Player.ID Game.ID Depth.Chart
rank, by default, averages ties, but see ?rank for other options.
In a dataset which contains many ids, I am only trying to manipulate rows which have id 7 or 9, and leave everything else untouched.
I am trying to conditionally remove a row from 7 or 9 in all instances where there isn't a variable that corresponds to it. So, if in the case of the dput example below, I want to remove the ninth row from id=9 because id=7 does not have an itemcode=2. Vice versa for id=7, I am trying to remove its itemcode=9 because id=9 does not have it.
id client item itemcode unit X2001 X2002 X2003 X2004 X2005 X2006 X2007
...
7 7 Bob eighth 8 100 13 18 15 NA NA NA NA
8 7 Bob ninth 9 100 11 21 10 NA NA NA NA
9 9 Bob_new first 1 100 NA NA NA 23 18 25 18
Code:
structure(list(id = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 10L), client = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L), .Label = c("Bob",
"Bob_new", "Mark"), class = "factor"), item = structure(c(3L,
9L, 4L, 2L, 8L, 7L, 1L, 5L, 3L, 6L, 9L, 4L, 2L, 8L, 7L, 1L, 3L
), .Label = c("eighth", "fifth", "first", "fourth", "ninth",
"second", "seventh", "sixth", "third"), class = "factor"), itemcode = c(1L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 1L
), unit = c(100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L,
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L), X2001 = structure(c(5L,
6L, 1L, 4L, 2L, 5L, 3L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L
), .Label = c("11", "12", "13", "22", "24", "25", "NA"), class = "factor"),
X2002 = structure(c(4L, 8L, 1L, 3L, 7L, 2L, 5L, 6L, 9L, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L), .Label = c("13", "14", "15",
"17", "18", "21", "22", "24", "NA"), class = "factor"), X2003 = structure(c(5L,
1L, 4L, 2L, 6L, 1L, 3L, 1L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L), .Label = c("10", "11", "15", "19", "23", "24", "NA"), class = "factor"),
X2004 = structure(c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 5L, 4L,
2L, 6L, 1L, 3L, 4L, 3L, 4L), .Label = c("11", "14", "15",
"20", "23", "25", "NA"), class = "factor"), X2005 = structure(c(6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 3L, 2L, 4L, 3L, 5L, 3L, 1L, 4L,
3L), .Label = c("11", "13", "18", "19", "25", "NA"), class = "factor"),
X2006 = structure(c(9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 8L, 6L,
1L, 2L, 5L, 3L, 7L, 8L, 4L), .Label = c("10", "15", "18",
"19", "20", "22", "23", "25", "NA"), class = "factor"), X2007 = structure(c(8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 4L, 7L, 6L, 2L, 4L, 1L, 5L, 5L,
3L), .Label = c("12", "13", "16", "18", "19", "21", "24",
"NA"), class = "factor")), .Names = c("id", "client", "item",
"itemcode", "unit", "X2001", "X2002", "X2003", "X2004", "X2005",
"X2006", "X2007"), class = "data.frame", row.names = c(NA, -17L
))
————————————————————————————————————————
ANOTHER SCENARIO:
before:
structure(list(id = c(7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
7L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L), client = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L), .Label = c("Bob", "Bob_new", "Mark"), class = "factor"),
item = structure(c(3L, 9L, 10L, 9L, 4L, 2L, 8L, 7L, 7L, 1L,
5L, 3L, 6L, 9L, 4L, 2L, 8L, 7L, 1L, 3L), .Label = c("eighth",
"fifth", "first", "fourth", "ninth", "second", "seventh",
"sixth", "third", "third "), class = "factor"), itemcode = c(1L,
3L, 3L, 3L, 4L, 5L, 6L, 7L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 1L), type = structure(c(1L, 1L, 2L, 3L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A",
"B", "C"), class = "factor"), unit = c(100L, 100L, 100L,
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L,
100L, 100L, 100L, 100L, 100L, 100L, 100L), X2001 = c(24L,
25L, 30L, 26L, 11L, 22L, 12L, 25L, 24L, 13L, 11L, NA, NA,
NA, NA, NA, NA, NA, NA, NA), X2002 = c(17L, 24L, 12L, 96L,
13L, 15L, 22L, 21L, 14L, 18L, 21L, NA, NA, NA, NA, NA, NA,
NA, NA, NA), X2003 = c(23L, 10L, 46L, 94L, 19L, 11L, 24L,
19L, 10L, 15L, 10L, NA, NA, NA, NA, NA, NA, NA, NA, NA),
X2004 = c(NA, NA, 43L, 83L, NA, NA, NA, 6L, NA, NA, NA, 23L,
20L, 14L, 25L, 11L, 15L, 20L, 15L, 20L), X2005 = c(NA, NA,
97L, 86L, NA, NA, NA, 17L, NA, NA, NA, 18L, 13L, 19L, 18L,
25L, 18L, 11L, 19L, 18L), X2006 = c(NA, NA, 11L, 91L, NA,
NA, NA, 11L, NA, NA, NA, 25L, 22L, 10L, 15L, 20L, 18L, 23L,
25L, 19L), X2007 = c(NA, NA, 19L, 27L, NA, NA, NA, 15L, NA,
NA, NA, 18L, 24L, 21L, 13L, 18L, 12L, 19L, 19L, 16L)), .Names = c("id",
"client", "item", "itemcode", "type", "unit", "X2001", "X2002",
"X2003", "X2004", "X2005", "X2006", "X2007"), class = "data.frame", row.names = c(NA,
-20L))
after:
structure(list(id = c(7L, 7L, 7L, 7L, 7L, 7L, 9L, 9L, 9L, 9L,
9L, 9L, 10L), client = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 3L), .Label = c("Bob", "Bob_new", "Mark"), class = "factor"),
item = structure(c(2L, 7L, 3L, 1L, 5L, 4L, 2L, 6L, 3L, 1L,
5L, 4L, 2L), .Label = c("fifth", "first", "fourth", "seventh",
"sixth", "third", "third "), class = "factor"), itemcode = c(1L,
3L, 4L, 5L, 6L, 7L, 1L, 3L, 4L, 5L, 6L, 7L, 1L), type = structure(c(1L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("A",
"B"), class = "factor"), unit = c(100L, 100L, 100L, 100L,
100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L), X2001 = c(24L,
10L, 11L, 22L, 12L, 17L, NA, NA, NA, NA, NA, NA, NA), X2002 = c(17L,
87L, 13L, 15L, 22L, 19L, NA, NA, NA, NA, NA, NA, NA), X2003 = c(23L,
47L, 19L, 11L, 24L, 17L, NA, NA, NA, NA, NA, NA, NA), X2004 = c(NA,
28L, NA, NA, NA, 28L, 23L, 14L, 25L, 11L, 15L, 20L, 20L),
X2005 = c(NA, 43L, NA, NA, NA, 16L, 18L, 19L, 18L, 25L, 18L,
11L, 18L), X2006 = c(NA, 69L, NA, NA, NA, 5L, 25L, 10L, 15L,
20L, 18L, 23L, 19L), X2007 = c(NA, 72L, NA, NA, NA, 20L,
18L, 21L, 13L, 18L, 12L, 19L, 16L)), .Names = c("id", "client",
"item", "itemcode", "type", "unit", "X2001", "X2002", "X2003",
"X2004", "X2005", "X2006", "X2007"), class = "data.frame", row.names = c(NA,
-13L))
I could implement the said filter code to remove items which do not exist in its corresponding place (id 7 and 9).
But if there are sub levels for items, like type of item. I am also trying to remove items if they don't have a type similar in the corresponding field.
You could use filter from dplyr
library(dplyr)
filter(df_all, itemcode %in% intersect(itemcode[id==7],
itemcode[id==9])|!id %in% c(7,9) )
# id client item itemcode unit X2001 X2002 X2003 X2004 X2005 X2006 X2007
#1 7 Bob first 1 100 24 17 23 NA NA NA NA
#2 7 Bob third 3 100 25 24 10 NA NA NA NA
#3 7 Bob fourth 4 100 11 13 19 NA NA NA NA
#4 7 Bob fifth 5 100 22 15 11 NA NA NA NA
#5 7 Bob sixth 6 100 12 22 24 NA NA NA NA
#6 7 Bob seventh 7 100 24 14 10 NA NA NA NA
#7 7 Bob eighth 8 100 13 18 15 NA NA NA NA
#8 9 Bob_new first 1 100 NA NA NA 23 18 25 18
#9 9 Bob_new third 3 100 NA NA NA 14 19 10 21
#10 9 Bob_new fourth 4 100 NA NA NA 25 18 15 13
#11 9 Bob_new fifth 5 100 NA NA NA 11 25 20 18
#12 9 Bob_new sixth 6 100 NA NA NA 15 18 18 12
#13 9 Bob_new seventh 7 100 NA NA NA 20 11 23 19
#14 9 Bob_new eighth 8 100 NA NA NA 15 19 25 19
#15 10 Mark first 1 100 NA NA NA 20 18 19 16
Update
Based on the new dataset, perhaps this helps
library(dplyr)
library(tidyr)
dfnew %>%
unite(itemtype, itemcode,type) %>%
filter(itemtype %in% intersect(itemtype[id==7],
itemtype[id==9])|!id %in% c(7,9)) %>%
separate(itemtype, c('itemcode', 'type'))
# id client item itemcode type unit X2001 X2002 X2003 X2004 X2005 X2006
# 1 7 Bob first 1 A 100 24 17 23 NA NA NA
# 2 7 Bob third 3 B 100 30 12 46 43 97 11
# 3 7 Bob fourth 4 A 100 11 13 19 NA NA NA
# 4 7 Bob fifth 5 A 100 22 15 11 NA NA NA
# 5 7 Bob sixth 6 A 100 12 22 24 NA NA NA
# 6 7 Bob seventh 7 A 100 25 21 19 6 17 11
# 7 9 Bob_new first 1 A 100 NA NA NA 23 18 25
# 8 9 Bob_new third 3 B 100 NA NA NA 14 19 10
# 9 9 Bob_new fourth 4 A 100 NA NA NA 25 18 15
# 10 9 Bob_new fifth 5 A 100 NA NA NA 11 25 20
# 11 9 Bob_new sixth 6 A 100 NA NA NA 15 18 18
# 12 9 Bob_new seventh 7 A 100 NA NA NA 20 11 23
# 13 10 Mark first 1 A 100 NA NA NA 20 18 19
# X2007
#1 NA
#2 19
#3 NA
#4 NA
#5 NA
#6 15
#7 18
#8 21
#9 13
#10 18
#11 12
#12 19
#13 16
If I understand the problem: every itemcode in id=9 subset must have identical itemcode in id=7 subset (and reverse). If it is not the case then we filter the row with the non-pair itemcode out, but leave everything with id not in 7 or 9. Here is one way of doing it:
First get common item codes:
items_9 <- df_all$itemcode[ df_all$id==9 ]
items_7 <- df_all$itemcode[ df_all$id==7 ]
items_common <- items_9[ items_9 %in% items_7 ]
select everything with common itemcodes for 7 and 9 and the rest:
df_new <- df_all[
which(
( df_all$id %in% c(7, 9) &
df_all$itemcode %in% items_common
) |
!df_all$id %in% c(7,9)
)
,]
library(dplyr)
df$remove <- paste(df$itemcode, df$type)
df<-invisible(filter(df,
remove %in% intersect(remove[type==7],
remove[type==9])|!type %in% c(7,9) ))
#Remove the additional column after filter
df$remove <- NULL
You could do something like this, which runs setdiff in both directions. The cl() function wasn't really necessary, but I really don't like writing the same expression over and over again.
f <- function(x, y) setdiff(union(x, y), x)
cl <- function(var) substitute(df$itemcode[df$id == x], list(x = var))
So now you can call f() on c(id7, id9) and then reverse it and get the c(id9, id7) result.
do.call(f, x <- list(cl(7), cl(9)))
# [1] 2
do.call(f, rev(x))
# [1] 9