I have sales data by year and model. The sales of model J are missing in every year. I want to fill them according to the following rule:
Fill each NA of model J with the maximum sales value of that year + 100. For instance, the maximum sale in 2015 was 984, so J has to be 984 + 100 = 1084 in 2015.
# Example data: six years (2015-2020) of sales for models A-J. Every model J
# entry is the string "NA", so c() coerces the whole `sales` vector to
# character.
df <- data.frame(
  model = rep(c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J"), times = 6),
  Year = rep(c(2015, 2016, 2017, 2018, 2019, 2020), each = 10),
  sales = c(
    450, 678, 456, 344, 984, 456, 234, 244, 655, "NA",
    234, 567, 234, 567, 232, 900, 1005, 1900, 450, "NA",
    567, 235, 456, 345, 144, 333, 555, 777, 111, "NA",
    222, 223, 445, 776, 331, 788, 980, 1003, 456, "NA",
    345, 2222, 3456, 456, 678, 8911, 4560, 4567, 4566, "NA",
    6666, 7777, 8888, 1233, 1255, 5677, 3411, 2344, 6122, "NA"
  )
)
You may try the following (the missing values are stored as the string "NA", so sales first needs to be converted to numeric):
library(dplyr)
# Convert the character `sales` column to numeric (the "NA" strings become
# real NA values, with a coercion warning), then replace each year's missing
# model J value with that year's maximum sale plus 100. The result is
# ungrouped so later verbs do not silently operate per year.
df %>%
  group_by(Year) %>%
  mutate(
    sales = as.numeric(sales),
    sales = ifelse(
      is.na(sales) & model == "J",
      max(sales, na.rm = TRUE) + 100,
      sales
    )
  ) %>%
  ungroup()
model Year sales
<chr> <dbl> <dbl>
1 A 2015 450
2 B 2015 678
3 C 2015 456
4 D 2015 344
5 E 2015 984
6 F 2015 456
7 G 2015 234
8 H 2015 244
9 I 2015 655
10 J 2015 1084
# … with 50 more rows
base R option:
# Example data: six years (2015-2020) of sales for models A-J. Every model J
# entry is the string "NA", so the `sales` column starts out as character.
df <- data.frame(
  model = rep(c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J"), times = 6),
  Year = rep(c(2015, 2016, 2017, 2018, 2019, 2020), each = 10),
  sales = c(
    450, 678, 456, 344, 984, 456, 234, 244, 655, "NA",
    234, 567, 234, 567, 232, 900, 1005, 1900, 450, "NA",
    567, 235, 456, 345, 144, 333, 555, 777, 111, "NA",
    222, 223, 445, 776, 331, 788, 980, 1003, 456, "NA",
    345, 2222, 3456, 456, 678, 8911, 4560, 4567, 4566, "NA",
    6666, 7777, 8888, 1233, 1255, 5677, 3411, 2344, 6122, "NA"
  )
)
df$sales <- as.numeric(df$sales)
#> Warning: NAs introduced by coercion

# Maximum sale of each row's own year. ave() passes FUN one year's values at a
# time, so FUN may only use its argument `x`; referring to the full-length
# `model` or `sales` columns inside FUN mixes rows from different years.
year_max <- ave(df$sales, df$Year, FUN = function(x) max(x, na.rm = TRUE))

# Only the missing model J entries are replaced; all other rows stay as is.
fill <- is.na(df$sales) & df$model == "J"
df$sales[fill] <- year_max[fill] + 100
df
#> model Year sales
#> 1 A 2015 450
#> 2 B 2015 678
#> 3 C 2015 456
#> 4 D 2015 344
#> 5 E 2015 984
#> 6 F 2015 456
#> 7 G 2015 234
#> 8 H 2015 244
#> 9 I 2015 655
#> 10 J 2015 1084
#> 11 A 2016 234
#> 12 B 2016 567
#> 13 C 2016 234
#> 14 D 2016 567
#> 15 E 2016 232
#> 16 F 2016 900
#> 17 G 2016 1005
#> 18 H 2016 1900
#> 19 I 2016 450
#> 20 J 2016 2000
#> 21 A 2017 567
#> 22 B 2017 235
#> 23 C 2017 456
#> 24 D 2017 345
#> 25 E 2017 144
#> 26 F 2017 333
#> 27 G 2017 555
#> 28 H 2017 777
#> 29 I 2017 111
#> 30 J 2017 877
#> 31 A 2018 222
#> 32 B 2018 223
#> 33 C 2018 445
#> 34 D 2018 776
#> 35 E 2018 331
#> 36 F 2018 788
#> 37 G 2018 980
#> 38 H 2018 1003
#> 39 I 2018 456
#> 40 J 2018 1103
#> 41 A 2019 345
#> 42 B 2019 2222
#> 43 C 2019 3456
#> 44 D 2019 456
#> 45 E 2019 678
#> 46 F 2019 8911
#> 47 G 2019 4560
#> 48 H 2019 4567
#> 49 I 2019 4566
#> 50 J 2019 9011
#> 51 A 2020 6666
#> 52 B 2020 7777
#> 53 C 2020 8888
#> 54 D 2020 1233
#> 55 E 2020 1255
#> 56 F 2020 5677
#> 57 G 2020 3411
#> 58 H 2020 2344
#> 59 I 2020 6122
#> 60 J 2020 8988
Created on 2022-07-08 by the reprex package (v2.0.1)
Related
I have a dataset where I have been able to loop over different test values with dpois. For simplicity's sake, I have used an average of 4 events per month and I wanted to know what is the likelihood of n or more events, given the average. Here is what I have managed to make work:
# Upper-tail Poisson probabilities for the counts 0..10, given a mean of
# 4 events per month. ppois() is vectorised over its first argument, so the
# whole vector is computed in one call; no loop is needed.
# Note: with lower.tail = FALSE, ppois(n, lambda) is P(X > n). For the
# probability of "n or more" events, P(X >= n), use ppois(n - 1, lambda, ...).
MonthlyAverage <- 4
cnt <- 0:10
CountProb <- ppois(cnt, MonthlyAverage, lower.tail = FALSE)
dfProb <- data.frame(cnt, CountProb)
I am interested in investigating this to figure out how many events I may expect each month given the mean of that month.
I would be looking to say:
For January, what is the probability of 0
For January, what is the probability of 1
For January, what is the probability of 2
etc...
For February, what is the probability of 0
For February, what is the probability of 1
For February, what is the probability of 2
etc.
To give something like (numbers here are just an example):
I thought about trying one loop to select the correct month and then remove the month column so I am just left with the single "Monthly Average" value and then performing the count loop, but that doesn't seem to work. I still get "Non-numeric argument to mathematical function". I feel like I'm close, but can anyone please point me in the right direction for the formatting?
a "tidy-style" solution:
library(tidyr)
library(dplyr)
## example data: one mean event count per month
df <- data.frame(
  Month = c("Jan", "Feb"),
  MonthlyAverage = c(5, 2)
)
> df
Month MonthlyAverage
1 Jan 5
2 Feb 2
# Attach the counts 1..10 to every month as a list-column, expand that to one
# row per (month, count) pair, then compute the upper-tail Poisson
# probability P(X > n) from that month's mean.
df |>
  mutate(n = list(1:10)) |>
  unnest_longer(n) |>
  mutate(CountProb = ppois(q = n, lambda = MonthlyAverage, lower.tail = FALSE))
# A tibble: 20 x 4
Month MonthlyAverage n CountProb
<chr> <dbl> <int> <dbl>
1 Jan 5 1 0.960
2 Jan 5 2 0.875
3 Jan 5 3 0.735
4 Jan 5 4 0.560
5 Jan 5 5 0.384
6 Jan 5 6 0.238
## ...
How about something like this:
# Build every (count, month number) combination, look up each month's mean and
# abbreviated name by month number, and compute the upper-tail Poisson
# probability P(X > cnt) for that mean. The last line shows only the columns
# of interest.
cnt <- seq.int(0L, 10L)
MonthlyAverage <- c(
  1.8, 1.56, 2.44, 1.86, 2.1, 2.3,
  2, 2.78, 1.89, 1.86, 1.4, 1.71
)
mnames <- c(
  "Jan", "Feb", "Mar", "Apr", "May", "Jun",
  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)
grid <- expand.grid(cnt = cnt, m_num = seq_len(12L))
grid[["MonthlyAverage"]] <- MonthlyAverage[grid[["m_num"]]]
grid[["month"]] <- mnames[grid[["m_num"]]]
grid[["prob"]] <- with(grid, ppois(cnt, MonthlyAverage, lower.tail = FALSE))
grid[, c("month", "cnt", "prob")]
#> month cnt prob
#> 1 Jan 0 8.347011e-01
#> 2 Jan 1 5.371631e-01
#> 3 Jan 2 2.693789e-01
#> 4 Jan 3 1.087084e-01
#> 5 Jan 4 3.640666e-02
#> 6 Jan 5 1.037804e-02
#> 7 Jan 6 2.569450e-03
#> 8 Jan 7 5.615272e-04
#> 9 Jan 8 1.097446e-04
#> 10 Jan 9 1.938814e-05
#> 11 Jan 10 3.123964e-06
#> 12 Feb 0 7.898639e-01
#> 13 Feb 1 4.620517e-01
#> 14 Feb 2 2.063581e-01
#> 15 Feb 3 7.339743e-02
#> 16 Feb 4 2.154277e-02
#> 17 Feb 5 5.364120e-03
#> 18 Feb 6 1.157670e-03
#> 19 Feb 7 2.202330e-04
#> 20 Feb 8 3.743272e-05
#> 21 Feb 9 5.747339e-06
#> 22 Feb 10 8.044197e-07
#> 23 Mar 0 9.128391e-01
#> 24 Mar 1 7.001667e-01
#> 25 Mar 2 4.407062e-01
#> 26 Mar 3 2.296784e-01
#> 27 Mar 4 1.009515e-01
#> 28 Mar 5 3.813271e-02
#> 29 Mar 6 1.258642e-02
#> 30 Mar 7 3.681711e-03
#> 31 Mar 8 9.657751e-04
#> 32 Mar 9 2.294546e-04
#> 33 Mar 10 4.979244e-05
#> 34 Apr 0 8.443274e-01
#> 35 Apr 1 5.547763e-01
#> 36 Apr 2 2.854938e-01
#> 37 Apr 3 1.185386e-01
#> 38 Apr 4 4.090445e-02
#> 39 Apr 5 1.202455e-02
#> 40 Apr 6 3.071778e-03
#> 41 Apr 7 6.928993e-04
#> 42 Apr 8 1.398099e-04
#> 43 Apr 9 2.550478e-05
#> 44 Apr 10 4.244028e-06
#> 45 May 0 8.775436e-01
#> 46 May 1 6.203851e-01
#> 47 May 2 3.503686e-01
#> 48 May 3 1.613572e-01
#> 49 May 4 6.212612e-02
#> 50 May 5 2.044908e-02
#> 51 May 6 5.862118e-03
#> 52 May 7 1.486029e-03
#> 53 May 8 3.373058e-04
#> 54 May 9 6.927041e-05
#> 55 May 10 1.298297e-05
#> 56 Jun 0 8.997412e-01
#> 57 Jun 1 6.691458e-01
#> 58 Jun 2 4.039612e-01
#> 59 Jun 3 2.006529e-01
#> 60 Jun 4 8.375072e-02
#> 61 Jun 5 2.997569e-02
#> 62 Jun 6 9.361934e-03
#> 63 Jun 7 2.588841e-03
#> 64 Jun 8 6.415773e-04
#> 65 Jun 9 1.439431e-04
#> 66 Jun 10 2.948727e-05
#> 67 Jul 0 8.646647e-01
#> 68 Jul 1 5.939942e-01
#> 69 Jul 2 3.233236e-01
#> 70 Jul 3 1.428765e-01
#> 71 Jul 4 5.265302e-02
#> 72 Jul 5 1.656361e-02
#> 73 Jul 6 4.533806e-03
#> 74 Jul 7 1.096719e-03
#> 75 Jul 8 2.374473e-04
#> 76 Jul 9 4.649808e-05
#> 77 Jul 10 8.308224e-06
#> 78 Aug 0 9.379615e-01
#> 79 Aug 1 7.654944e-01
#> 80 Aug 2 5.257652e-01
#> 81 Aug 3 3.036162e-01
#> 82 Aug 4 1.492226e-01
#> 83 Aug 5 6.337975e-02
#> 84 Aug 6 2.360590e-02
#> 85 Aug 7 7.809999e-03
#> 86 Aug 8 2.320924e-03
#> 87 Aug 9 6.254093e-04
#> 88 Aug 10 1.540564e-04
#> 89 Sep 0 8.489282e-01
#> 90 Sep 1 5.634025e-01
#> 91 Sep 2 2.935807e-01
#> 92 Sep 3 1.235929e-01
#> 93 Sep 4 4.327373e-02
#> 94 Sep 5 1.291307e-02
#> 95 Sep 6 3.349459e-03
#> 96 Sep 7 7.672845e-04
#> 97 Sep 8 1.572459e-04
#> 98 Sep 9 2.913775e-05
#> 99 Sep 10 4.925312e-06
#> 100 Oct 0 8.443274e-01
#> 101 Oct 1 5.547763e-01
#> 102 Oct 2 2.854938e-01
#> 103 Oct 3 1.185386e-01
#> 104 Oct 4 4.090445e-02
#> 105 Oct 5 1.202455e-02
#> 106 Oct 6 3.071778e-03
#> 107 Oct 7 6.928993e-04
#> 108 Oct 8 1.398099e-04
#> 109 Oct 9 2.550478e-05
#> 110 Oct 10 4.244028e-06
#> 111 Nov 0 7.534030e-01
#> 112 Nov 1 4.081673e-01
#> 113 Nov 2 1.665023e-01
#> 114 Nov 3 5.372525e-02
#> 115 Nov 4 1.425330e-02
#> 116 Nov 5 3.201149e-03
#> 117 Nov 6 6.223149e-04
#> 118 Nov 7 1.065480e-04
#> 119 Nov 8 1.628881e-05
#> 120 Nov 9 2.248494e-06
#> 121 Nov 10 2.828495e-07
#> 122 Dec 0 8.191342e-01
#> 123 Dec 1 5.098537e-01
#> 124 Dec 2 2.454189e-01
#> 125 Dec 3 9.469102e-02
#> 126 Dec 4 3.025486e-02
#> 127 Dec 5 8.217692e-03
#> 128 Dec 6 1.937100e-03
#> 129 Dec 7 4.028407e-04
#> 130 Dec 8 7.489285e-05
#> 131 Dec 9 1.258275e-05
#> 132 Dec 10 1.927729e-06
Created on 2023-01-09 by the reprex package (v2.0.1)
If you have each month's mean, in base R you could easily use sapply to estimate the probability of obtaining values 0 to 10 using each month's mean value. Then you can simply combine it in a data frame:
# Data
# Mean event count for each calendar month.
df <- data.frame(
  month = month.name,
  mean = c(1.8, 2.8, 1.7, 1.6, 1.8, 2, 2.3, 2.4, 2.1, 1.4, 1.9, 1.9)
)

# One column per month: the upper-tail Poisson probability P(X > k) for
# k = 0..10 under that month's mean. vapply() fixes the shape of each result
# (11 numbers), so the output is always an 11 x 12 numeric matrix.
probs <- vapply(
  1:12,
  function(x) ppois(0:10, df$mean[x], lower.tail = FALSE),
  numeric(11)
)

# as.vector() unrolls the matrix column by column, i.e. month by month, which
# matches the rep() layout used for `month` and `events`.
finaldata <- data.frame(
  month = rep(month.name, each = 11),
  events = rep(0:10, times = 12),
  prob = as.vector(probs)
)
Output:
# month events prob
# 1 January 0 8.347011e-01
# 2 January 1 5.371631e-01
# 3 January 2 2.693789e-01
# 4 January 3 1.087084e-01
# 5 January 4 3.640666e-02
# 6 January 5 1.037804e-02
# 7 January 6 2.569450e-03
# 8 January 7 5.615272e-04
# 9 January 8 1.097446e-04
# 10 January 9 1.938814e-05
# 11 January 10 3.123964e-06
# 12 February 0 9.391899e-01
# 13 February 1 7.689218e-01
# 14 February 2 5.305463e-01
# 15 February 3 3.080626e-01
# ...
# 131 December 9 3.044317e-05
# 132 December 10 5.172695e-06
Below is the sample data. I know that I have to do a left join. The question is how to have it return only the rows that match (indcodelist = indcodelist2) and, for each match, only the row with the highest codetype value.
# First table: one row per code in indcodelist.
indcodelist <- c(
  110000, 111000, 112000, 113000, 114000,
  115000, 121000, 210000, 211000, 315000
)
estemp <- c(11, 21, 31, 41, 51, 61, 55, 21, 22, 874)
projemp <- c(15, 25, 36, 45, 52, 61, 31, 29, 31, 899)
nchg <- c(4, 4, 5, 4, 1, 0, -24, 8, 9, 25)
firsttable <- data.frame(indcodelist, estemp, projemp, nchg)

# Second table: a title per (code, codetype). The first four codes appear
# twice, once with codetype 18 and once with codetype 10.
indcodelist2 <- c(
  110000, 111000, 112000, 113000, 114000,
  115000, 121000, 210000, 211000, 315000,
  110000, 111000, 112000, 113000
)
codetype <- c(rep(18, 10), rep(10, 4))
codetitle <- c(
  "Accountant", "Doctor", "Lawyer", "Teacher", "Economist",
  "Financial Analyst", "Meteorologist", "Dentist", "Editor", "Veterinarian",
  "Accounting Technician", "Doctor", "Lawyer", "Teacher"
)
secondtable <- data.frame(indcodelist2, codetype, codetitle)
# Attempted join (left_join is presumably dplyr's): keeps every row of
# firsttable and attaches all matching secondtable rows, so the codes listed
# under both codetype 18 and codetype 10 appear twice in the result.
tried <- left_join(firsttable,secondtable, by =c(indcodelist = "indcodelist2"))
Desired Result
indcodelist estemp projemp nchg codetitle
110000 11 15 4 Accountant
111000 21 25 4 Doctor
If you only want values that match in both tables, inner_join might be what you’re looking for. You can see this answer to understand different types of joins.
To get the highest codetype, you can use dplyr::slice_max(). Be aware the default behavior is to return values that tie. If there is more than one codetitle at the same codetype, they’ll all be returned.
library(tidyverse)
# Keep only the codes present in both tables, then, for each code, retain the
# row(s) with the largest codetype (rows tied for the maximum are all kept).
firsttable %>%
  inner_join(secondtable, by = c("indcodelist" = "indcodelist2")) %>%
  group_by(indcodelist) %>%
  slice_max(order_by = codetype)
#> # A tibble: 10 × 6
#> # Groups: indcodelist [10]
#> indcodelist estemp projemp nchg codetype codetitle
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 110000 11 15 4 18 Accountant
#> 2 111000 21 25 4 18 Doctor
#> 3 112000 31 36 5 18 Lawyer
#> 4 113000 41 45 4 18 Teacher
#> 5 114000 51 52 1 18 Economist
#> 6 115000 61 61 0 18 Financial Analyst
#> 7 121000 55 31 -24 18 Meteorologist
#> 8 210000 21 29 8 18 Dentist
#> 9 211000 22 31 9 18 Editor
#> 10 315000 874 899 25 18 Veterinarian
Created on 2022-09-15 by the reprex package (v2.0.1)
You might use {powerjoin} :
library(powerjoin)
# Inner join of firsttable with a pre-summarised secondtable: the summary
# expression selects, via which.max(codetype), the first row with the largest
# codetype.
# NOTE(review): presumably summarize_by_keys() evaluates the expression once
# per join key, giving one secondtable row per indcodelist2 - confirm against
# the powerjoin documentation.
power_inner_join(
firsttable,
secondtable |> summarize_by_keys(dplyr::across()[which.max(codetype),]),
by = c("indcodelist" = "indcodelist2")
)
#> indcodelist estemp projemp nchg codetype codetitle
#> 1 110000 11 15 4 18 Accountant
#> 2 111000 21 25 4 18 Doctor
#> 3 112000 31 36 5 18 Lawyer
#> 4 113000 41 45 4 18 Teacher
#> 5 114000 51 52 1 18 Economist
#> 6 115000 61 61 0 18 Financial Analyst
#> 7 121000 55 31 -24 18 Meteorologist
#> 8 210000 21 29 8 18 Dentist
#> 9 211000 22 31 9 18 Editor
#> 10 315000 874 899 25 18 Veterinarian
I have a data.frame (df) with two columns (Date & Count) which looks something like shown below:
Date Count
1/1/2022 5
1/2/2022 13
1/3/2022 21
1/4/2022 29
1/5/2022 37
1/6/2022 45
1/7/2022 53
1/8/2022 61
1/9/2022 69
1/10/2022 77
1/11/2022 85
1/12/2022 93
1/13/2022 101
1/14/2022 109
1/15/2022 117
Since I have single variable (count), the idea is to identify if there's been a change in mean in every three days, therefore I want to apply rolling t.test with a window of 3 days and save the resulting p-value next to Count column which I can plot later. Since I have seen people doing these sorts of tests with two variables usually, I can't figure out how to do it with a single variable.
For example, I saw this relevant answer here:
# Two-sample t-test between the first and second column of `dat`.
# Returns only the p-value of the test.
ttestFun <- function(dat) {
  first_col <- dat[, 1]
  second_col <- dat[, 2]
  t.test(x = first_col, y = second_col)$p.value
}
# Apply ttestFun over windows of width 7 along df_ts. by.column = FALSE hands
# the function all columns of a window at once instead of one column at a
# time. (rollapply() is presumably zoo::rollapply; fill = NA then pads the
# positions that have no complete window - confirm.)
rollapply(df_ts, 7, FUN = ttestFun, fill = NA, by.column = FALSE)
But again, this is with two columns. Any guidance please?
Irrespective of any discussion about the usefulness of the approach, given a fixed number of measurements of 3, you could just shift the counts by 3 and perform t-test between two columns as in your example, such as:
library(data.table)
# Reproducible example data: one random count (1..200) per day from
# 2022-01-01 to 2022-02-01.
set.seed(123)
dates <- seq(as.POSIXct("2022-01-01"), as.POSIXct("2022-02-01"), by = "1 day")
dt <- data.table(Date=dates, count = sample(1:200, length(dates), replace=TRUE), key="Date")
# nxt holds the count from three rows later, so inside a block of three rows
# nxt contains the counts of the following block.
dt[, nxt:=shift(count, 3, type = "lead")]
# Label consecutive runs of three rows; the [seq_along(dates)] index trims the
# labels to the number of rows when that is not a multiple of three.
dt[, group:=rep(1:ceiling(length(dates)/3), each=3)[seq_along(dates)]]
# Per block, t-test its counts against the next block's counts. Blocks where
# t.test() raises an error get NA instead. The trailing [] prints the result.
dt[, p:= tryCatch(t.test(count, nxt)$p.value, error=function(e) NA), by="group"][]
#> Date count nxt group p
#> 1: 2022-01-01 159 195 1 0.7750944
#> 2: 2022-01-02 179 170 1 0.7750944
#> 3: 2022-01-03 14 50 1 0.7750944
#> 4: 2022-01-04 195 118 2 0.2240362
#> 5: 2022-01-05 170 43 2 0.2240362
#> 6: 2022-01-06 50 14 2 0.2240362
#> 7: 2022-01-07 118 118 3 0.1763296
#> 8: 2022-01-08 43 153 3 0.1763296
#> 9: 2022-01-09 14 90 3 0.1763296
#> 10: 2022-01-10 118 91 4 0.8896343
#> 11: 2022-01-11 153 197 4 0.8896343
#> 12: 2022-01-12 90 91 4 0.8896343
#> 13: 2022-01-13 91 185 5 0.8065021
#> 14: 2022-01-14 197 92 5 0.8065021
#> 15: 2022-01-15 91 137 5 0.8065021
#> 16: 2022-01-16 185 99 6 0.1060465
#> 17: 2022-01-17 92 72 6 0.1060465
#> 18: 2022-01-18 137 26 6 0.1060465
#> 19: 2022-01-19 99 7 7 0.5283156
#> 20: 2022-01-20 72 170 7 0.5283156
#> 21: 2022-01-21 26 137 7 0.5283156
#> 22: 2022-01-22 7 164 8 0.9612965
#> 23: 2022-01-23 170 78 8 0.9612965
#> 24: 2022-01-24 137 81 8 0.9612965
#> 25: 2022-01-25 164 43 9 0.6111337
#> 26: 2022-01-26 78 103 9 0.6111337
#> 27: 2022-01-27 81 117 9 0.6111337
#> 28: 2022-01-28 43 76 10 0.6453494
#> 29: 2022-01-29 103 143 10 0.6453494
#> 30: 2022-01-30 117 NA 10 0.6453494
#> 31: 2022-01-31 76 NA 11 NA
#> 32: 2022-02-01 143 NA 11 NA
#> Date count nxt group p
Created on 2022-04-07 by the reprex package (v2.0.1)
You could further clean that up, e.g. by taking the first date per group:
# One row per group: its first date, its mean count rounded to two decimals,
# and the group's p-value (taken from the group's first row).
dt[, .(Date=Date[1], count=round(mean(count), 2), p=p[1]), by="group"]
#> group Date count p
#> 1: 1 2022-01-01 117.33 0.7750944
#> 2: 2 2022-01-04 138.33 0.2240362
#> 3: 3 2022-01-07 58.33 0.1763296
#> 4: 4 2022-01-10 120.33 0.8896343
#> 5: 5 2022-01-13 126.33 0.8065021
#> 6: 6 2022-01-16 138.00 0.1060465
#> 7: 7 2022-01-19 65.67 0.5283156
#> 8: 8 2022-01-22 104.67 0.9612965
#> 9: 9 2022-01-25 107.67 0.6111337
#> 10: 10 2022-01-28 87.67 0.6453494
#> 11: 11 2022-01-31 109.50 NA
You can create a grp, and then simply apply a t.test to each consecutive pair of groups:
# Label consecutive runs of three rows as groups 1, 2, ...
# (this requires the number of rows of `d` to be a multiple of 3).
d <- d %>% mutate(grp = rep(1:(n() / 3), each = 3))

# p-value of a t-test between each group's counts and those of the group
# before it. vapply() guarantees exactly one numeric p-value per group.
pvals <- tibble(
  grp = 2:max(d$grp),
  pval = vapply(
    2:max(d$grp),
    function(g) t.test(d$Count[d$grp == g], d$Count[d$grp == g - 1])$p.value,
    numeric(1)
  )
)

# Group 1 has no predecessor, so the left join leaves its pval as NA. The join
# key is named explicitly to avoid the implicit-key message. Finally keep the
# earliest date of each group.
d %>%
  left_join(pvals, by = "grp") %>%
  group_by(grp) %>%
  slice_min(Date)
Output: (p-value is constant only because of the example data you provided)
Date Count grp pval
<date> <dbl> <int> <dbl>
1 2022-01-01 5 1 NA
2 2022-01-04 29 2 0.0213
3 2022-01-07 53 3 0.0213
4 2022-01-10 77 4 0.0213
5 2022-01-13 101 5 0.0213
Or a data.table approach:
# Step 1: convert `d` to a data.table and add, by reference, grp (labels for
# consecutive runs of three rows) and cy (the Count from three rows earlier,
# i.e. the matching value of the previous group).
# Step 2: for rows that have a cy value, t-test the group's Count against cy
# and store the p-value per group.
# Step 3: keep the first row of each group, leaving out the helper column cy.
setDT(d)[, `:=`(grp=rep(1:(nrow(d)/3), each=3),cy=shift(Count,3))] %>%
.[!is.na(cy), pval:=t.test(Count,cy)$p.value, by=grp] %>%
.[,.SD[1], by=grp, .SDcols=!c("cy")]
Output:
grp Date Count pval
<int> <Date> <num> <num>
1: 1 2022-01-01 5 NA
2: 2 2022-01-04 29 0.02131164
3: 3 2022-01-07 53 0.02131164
4: 4 2022-01-10 77 0.02131164
5: 5 2022-01-13 101 0.02131164
Let's assume we somehow ended up with a data frame object (T2 in the example below) and we want to subset our original data with it. Is there a way to do this without using | in the subset() call?
Here is a dataset I was playing but failed
# Load the example data and give its columns readable names.
education = read.csv("https://vincentarelbundock.github.io/Rdatasets/csv/robustbase/education.csv", stringsAsFactors = FALSE)
colnames(education) = c("X", "State", "Region", "Urban.Population", "Per.Capita.Income", "Minor.Population", "Education.Expenditures")
head(education)
# T1: row numbers of interest; T2: the State values found in those rows.
T1 = c(1,4,13,15,17,23,33,38)
T2 = education[T1,]$State
# Works, but every state has to be spelled out and chained with |.
subset(education, State=="ME"| State=="MA" | State=="MI" | State=="MN" | State=="MO" | State=="MD" | State=="MS" | State=="MT")
# Works for a single element of T2.
subset(education, State==T2[3])
# Does not do what is intended: == compares element by element and recycles T2
# along State, so a row is kept only if its State equals the T2 element that
# happens to line up with it.
subset(education, State==T2)
PS: I created T2 as the states starting with M, but I don't want to use string matching or anything like that. Just assume we somehow ended up with T2, which contains some states.
I'm not quite sure what would be an acceptable answer but subset(education, State %in% T2) uses T2 as is and does not use |. Does this solve your problem? It's almost the same approach as Jon Spring points out in the comments, but instead of specifying a vector we can just use T2 with %in%. You say T2 is a data.frame object, but in the data you provided it turns out to be a character vector.
# Load the example data and give its columns readable names.
education <- read.csv(
  "https://vincentarelbundock.github.io/Rdatasets/csv/robustbase/education.csv",
  stringsAsFactors = FALSE
)
colnames(education) <- c(
  "X", "State", "Region", "Urban.Population",
  "Per.Capita.Income", "Minor.Population", "Education.Expenditures"
)
# T1: row numbers of interest; T2: the State values found in those rows.
T1 <- c(1, 4, 13, 15, 17, 23, 33, 38)
T2 <- education$State[T1]
T2 # T2 is not a data.frame object (R 4.0)
#> [1] "ME" "MA" "MI" "MN" "MO" "MD" "MS" "MT"
# %in% tests every State for membership in T2, so no chain of | is needed.
subset(education, State %in% T2)
#> X State Region Urban.Population Per.Capita.Income Minor.Population
#> 1 1 ME 1 508 3944 325
#> 4 4 MA 1 846 5233 305
#> 13 13 MI 2 738 5439 337
#> 15 15 MN 2 664 4921 330
#> 17 17 MO 2 701 4672 309
#> 23 23 MD 3 766 5331 323
#> 33 33 MS 3 445 3448 358
#> 38 38 MT 4 534 4418 335
#> Education.Expenditures
#> 1 235
#> 4 261
#> 13 379
#> 15 378
#> 17 231
#> 23 330
#> 33 215
#> 38 302
But let's say T2 were an actual data.frame:
# Selecting the column with ["State"] (single brackets, by name) keeps the
# data.frame structure, so T2 is now a one-column data.frame.
T2 = education[T1,]["State"]
T2 #check
#> State
#> 1 ME
#> 4 MA
#> 13 MI
#> 15 MN
#> 17 MO
#> 23 MD
#> 33 MS
#> 38 MT
Then we could coerce it into a vector by subsetting it with drop = TRUE.
# With both indices left empty, drop = TRUE collapses the one-column
# data.frame T2 to a plain vector that %in% can match against.
subset(education, State %in% T2[, , drop = TRUE])
#> X State Region Urban.Population Per.Capita.Income Minor.Population
#> 1 1 ME 1 508 3944 325
#> 4 4 MA 1 846 5233 305
#> 13 13 MI 2 738 5439 337
#> 15 15 MN 2 664 4921 330
#> 17 17 MO 2 701 4672 309
#> 23 23 MD 3 766 5331 323
#> 33 33 MS 3 445 3448 358
#> 38 38 MT 4 534 4418 335
#> Education.Expenditures
#> 1 235
#> 4 261
#> 13 379
#> 15 378
#> 17 231
#> 23 330
#> 33 215
#> 38 302
Created on 2021-06-12 by the reprex package (v0.3.0)
I have been trying to learn the most basic items first and then expand the complexity. For this one, how would I modify the last line so that it creates a rolling 12-month average for each seriescode? In this case, it would produce an average of 8 for seriescode 100 and 27 for seriescode 110.
First, is the sample data
# Sample data: the same 13 Monthx values (201911 .. 202011, written as year
# followed by the month number without zero padding) for each of the two
# series codes 100 and 110.
Monthx <- rep(
  c(
    201911, 201912, 20201, 20202, 20203, 20204, 20205,
    20206, 20207, 20208, 20209, 202010, 202011
  ),
  times = 2
)
empx <- c(
  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
  21, 22, 23, 24, 25, 26, 27, 28, 29, 20, 31, 32, 33
)
seriescode <- rep(c(100, 110), each = 13)
ces12x <- data.frame(Monthx, empx, seriescode)
Manipulations
library(dplyr)
# Split Monthx into year (first four characters) and month (the remaining
# characters), then build a Date for the first day of that month.
ces12x<- ces12x %>% mutate(year = substr(as.numeric(Monthx),1,4),
month = substr(as.numeric(Monthx),5,7),
date = as.Date(paste(year,month,"1",sep ="-")))
# NOTE(review): Month_ord is not used in the code shown here. order(Monthx)
# sorts numerically, so five-digit values such as 20201 come before 201911.
Month_ord <- order(Monthx)
# Right-aligned 12-row rolling mean over the whole empx column; rows without
# 12 values behind them get NA. No grouping is applied, so the window runs
# across series codes.
ces12x<-ces12x %>% mutate(ravg = zoo::rollmeanr(empx, 12, fill = NA))
You would just need to add a group_by(seriescode) which would then perform the mutate functions per seriescode:
# The pipe and the verbs below come from dplyr; load it so the snippet runs on
# its own.
library(dplyr)

# Sample data: the same 13 Monthx values (201911 .. 202011, written as year
# followed by the month number without zero padding) for each of the two
# series codes 100 and 110.
Monthx <- c(
  201911, 201912, 20201, 20202, 20203, 20204, 20205,
  20206, 20207, 20208, 20209, 202010, 202011,
  201911, 201912, 20201, 20202, 20203, 20204, 20205,
  20206, 20207, 20208, 20209, 202010, 202011
)
empx <- c(
  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
  21, 22, 23, 24, 25, 26, 27, 28, 29, 20, 31, 32, 33
)
seriescode <- c(
  100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
  110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110
)
ces12x <- data.frame(Monthx, empx, seriescode)

# Split Monthx into year (first four characters) and month (the remaining
# characters), then build a Date for the first day of that month.
ces12x <- ces12x %>%
  mutate(
    year = substr(as.numeric(Monthx), 1, 4),
    month = substr(as.numeric(Monthx), 5, 7),
    date = as.Date(paste(year, month, "1", sep = "-"))
  )

# The rolling window runs over rows in their current order, so sort
# chronologically within each series first. (The former
# `Month_ord <- order(Monthx)` was never applied and is not chronological,
# because 20201 sorts before 201911; sorting by `date` replaces it.)
# group_by(seriescode) then restarts the right-aligned 12-row rolling mean for
# every series; rows without 12 values behind them get NA.
ces12x <- ces12x %>%
  arrange(seriescode, date) %>%
  group_by(seriescode) %>%
  mutate(ravg = zoo::rollmeanr(empx, 12, fill = NA))
This produces the output:
# A tibble: 26 x 7
# Groups: seriescode [2]
Monthx empx seriescode year month date ravg
<dbl> <dbl> <dbl> <chr> <chr> <date> <dbl>
1 201911 1 100 2019 11 2019-11-01 NA
2 201912 2 100 2019 12 2019-12-01 NA
3 20201 3 100 2020 1 2020-01-01 NA
4 20202 4 100 2020 2 2020-02-01 NA
5 20203 5 100 2020 3 2020-03-01 NA
6 20204 6 100 2020 4 2020-04-01 NA
7 20205 7 100 2020 5 2020-05-01 NA
8 20206 8 100 2020 6 2020-06-01 NA
9 20207 9 100 2020 7 2020-07-01 NA
10 20208 10 100 2020 8 2020-08-01 NA
11 20209 11 100 2020 9 2020-09-01 NA
12 202010 12 100 2020 10 2020-10-01 6.5
13 202011 13 100 2020 11 2020-11-01 7.5
14 201911 21 110 2019 11 2019-11-01 NA
15 201912 22 110 2019 12 2019-12-01 NA
16 20201 23 110 2020 1 2020-01-01 NA
17 20202 24 110 2020 2 2020-02-01 NA
18 20203 25 110 2020 3 2020-03-01 NA
19 20204 26 110 2020 4 2020-04-01 NA
20 20205 27 110 2020 5 2020-05-01 NA
21 20206 28 110 2020 6 2020-06-01 NA
22 20207 29 110 2020 7 2020-07-01 NA
23 20208 20 110 2020 8 2020-08-01 NA
24 20209 31 110 2020 9 2020-09-01 NA
25 202010 32 110 2020 10 2020-10-01 25.7
26 202011 33 110 2020 11 2020-11-01 26.7
If you want to continue using the tidyverse for this, the following should do the trick:
library(dplyr)
# For each series: order the rows by date, keep its last 12 rows, and average
# empx over them (one result row per seriescode).
ces12x %>%
  group_by(seriescode) %>%
  arrange(date) %>%
  slice_tail(n = 12) %>%
  summarize(ravg = mean(empx))