I would like to calculate, per ID and per condition, the mean of every SACCADIC_RT for which COMMISSION_ERROR = 1 and which follows 5 consecutive trials with HITS = 1.
ID | TRIAL | TRIAL_TYPE| CONDITION | COMMISSION_ERROR | HITS| SACCADIC_RT
1 183 nogo square_1 1 -1 175
1 54 go square_1 -1 1 259
1 26 nogo square_1 1 -1 365
1 3 nogo square_1 1 -1 346
1 100 nogo square_1 1 -1 287
1 11 go square_1 -1 1 164
1 52 go square_1 -1 1 244
1 8 go square_1 -1 1 223
1 10 go square_1 -1 1 183
1 21 go square_1 -1 1 234
1 32 go square_1 1 -1 221
1 2 go square_1 -1 1 183
1 13 nogo square_1 0 -1 -1
1 87 nogo square_2 1 -1 228
1 95 nogo square_2 1 -1 274
1 111 go square_2 -1 1 305
1 28 nogo square_2 0 -1 309
1 65 go square_2 -1 0 -1
1 40 nogo square_1 0 -1 199
1 19 nogo square_1 0 -1 207
1 28 go square_1 -1 1 257
2 45 nogo square_1 1 -1 169
2 197 nogo square_1 1 -1 350
2 115 nogo square_1 1 -1 321
2 65 go square_2 -1 1 298
2 24 go square_2 -1 0 -1
2 1 nogo square_2 1 -1 183
2 77 go square_2 -1 1 225
2 90 go square_2 -1 1 305
2 89 go square_2 -1 1 210
2 104 go square_2 -1 1 199
2 116 go square_2 -1 1 175
2 29 nogo square_2 1 -1 99
2 41 go square_2 -1 1 104
The sample table can be recreated in R as:
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), X..TRIAL.. = c(183L, 54L, 26L,
3L, 100L, 11L, 52L, 8L, 10L, 21L, 32L, 2L, 13L, 87L, 95L, 111L,
28L, 65L, 40L, 19L, 28L, 45L, 197L, 115L, 65L, 24L, 1L, 77L,
90L, 89L, 104L, 116L, 29L, 41L), TRIAL_TYPE. = structure(c(2L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
1L), .Label = c("go", "nogo"), class = "factor"), CONDITION = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("square_1", "square_2"), class = "factor"), X..COMMISSION_ERROR = c(1L,
-1L, 1L, 1L, 1L, -1L, -1L, -1L, -1L, -1L, 1L, -1L, 0L, 1L, 1L,
-1L, 0L, -1L, 0L, 0L, -1L, 1L, 1L, 1L, -1L, -1L, 1L, -1L, -1L,
-1L, -1L, -1L, 1L, -1L), X..HITS. = c(-1L, 1L, -1L, -1L, -1L,
1L, 1L, 1L, 1L, 1L, -1L, 1L, -1L, -1L, -1L, 1L, -1L, 0L, -1L,
-1L, 1L, -1L, -1L, -1L, 1L, 0L, -1L, 1L, 1L, 1L, 1L, 1L, -1L,
1L), SACCADIC_RT = c(175L, 259L, 365L, 346L, 287L, 164L, 244L,
223L, 183L, 234L, 221L, 183L, -1L, 228L, 274L, 305L, 309L, -1L,
199L, 207L, 257L, 169L, 350L, 321L, 298L, -1L, 183L, 225L, 305L,
210L, 199L, 175L, 99L, 104L)), .Names = c("ID", "X..TRIAL..",
"TRIAL_TYPE.", "CONDITION", "X..COMMISSION_ERROR", "X..HITS.",
"SACCADIC_RT"), class = "data.frame", row.names = c(NA, -34L))
So the result for this example will look like this:
ID | CONDITION | x
1 square_1 221
2 square_2 99
You can use the package data.table to perform this task.
The steps are then as follows:
1) For each ID and condition, calculate the rolling sum of HITS over 5 rows
2) Keep only the rows that satisfy two conditions: COMMISSION_ERROR = 1 on the current row, and a rolling sum of 5 on the previous row
3) Calculate the mean of SACCADIC_RT for each ID and condition in the table created in step 2
# load your data
data <- read.csv("./yourData.csv")
# load data table library
library(data.table)
# convert your data to data.table object
data <- data.table(data)
# group data by ID and Condition, calculate rolling sum over 5 rows
data[, roll := Reduce('+', shift(HITS, 0:4)), by = list(ID, CONDITION)]
# take only rows where there were 5 hits in a row and commission error is 1
newData <- data[shift(roll, 1) == 5 & COMMISSION_ERROR == 1]
# calculate mean of SACCADIC_RT for each ID and Condition in the new dataset
newData[, meanSacc := mean(SACCADIC_RT), by = list(ID, CONDITION)]
Code Explained:
roll := Reduce('+', shift(HITS, 0:4))
The shift function lets you compute a value in the current row based on values in previous rows. Here Reduce('+', shift(HITS, 0:4)) takes the value of HITS on a given row and adds to it the values of HITS on the 4 previous rows. This rolling sum is then written to a new column called roll.
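For intuition, here is a toy illustration of that rolling sum on an invented vector (not part of the original data):
library(data.table)
x <- c(1, 1, 1, 1, 1, -1)
# add the current value to the values of the 4 previous positions; NA until 5 values are available
Reduce('+', shift(x, 0:4))
# [1] NA NA NA NA  5  3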
newData <- data[shift(roll, 1) == 5 & COMMISSION_ERROR == 1]
The above code keeps only those rows of the original dataset where the previous row has the value 5 in the roll column and COMMISSION_ERROR on the current row equals 1.
newData[, meanSacc := mean(SACCADIC_RT), by = list(ID, CONDITION)]
The above snippet calculates the mean of SACCADIC_RT for each ID and CONDITION, using only the rows in the new dataset created above. The means are then written to a new column called meanSacc.
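If you want the result collapsed to one row per ID and CONDITION, as in the expected output table above, a possible final step (using the same newData object) is:
# one mean per ID and CONDITION
result <- newData[, .(x = mean(SACCADIC_RT)), by = .(ID, CONDITION)]
result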
My data looks like this:
structure(list(did = c(209L, 209L, 206L, 206L, 206L, 206L, 206L,
206L, 206L, 206L, 206L, 209L, 206L, 206L, 207L, 207L, 207L, 207L,
209L, 209L), hhid = c(5668, 5595, 4724, 4756, 4856, 4730, 4757,
6320, 4758, 6319, 6311, 5477, 6322, 6317, 134, 178, 238, 179,
5865, 5875), bc = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L,
1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L), rc = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
oap = c(2L, 2L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 2L, 2L, 0L, 2L,
2L, 2L, 2L, 2L, 2L, 0L, 0L)), row.names = c(NA, 20L), class = "data.frame")
hhid is unique for each row. The remaining columns consist of 0s and 1s in some cases and 0s, 1s and 2s in others.
The required output looks like this:
did hh_count bc_0 bc_1 bc_2 rc_0 rc_1 rc_2 oap_0 oap_1 oap_2
where did will be unique, and hh_count will be the count of hhid values associated with each did.
bc_0, bc_1 and bc_2 will be a breakdown of column bc, representing the counts of 0s, 1s and 2s in bc. Similarly for rc_0, rc_1 and rc_2, and for oap_0, oap_1 and oap_2. So counting of 0s, 1s and 2s is required.
With counts of 3 specific values, writing the functions manually seems reasonable. If you need counts of more distinct values we could come up with a better way to generalize, probably converting your data to long format, summarizing, and then going back to wide (a sketch of that approach follows the output below).
library(dplyr) # across() requires dplyr version 1.0 or higher
dd %>% # (calling your data dd)
group_by(did) %>%
summarize(
hh_count = n_distinct(hhid),
across(c(bc, rc, oap),
.fns = list("0" = ~sum(. == 0), "1" = ~sum(. == 1), "2" = ~sum(. == 2)),
.names = "{.col}_{.fn}" # this is the default, but I show it explicitly
)
)
# # A tibble: 3 x 11
# did hh_count bc_0 bc_1 bc_2 rc_0 rc_1 rc_2 oap_0 oap_1 oap_2
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1 206 11 2 9 0 2 9 0 6 0 5
# 2 207 4 0 4 0 0 4 0 0 0 4
# 3 209 5 2 3 0 0 5 0 3 0 2
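As mentioned above, one way to generalize to more distinct values is to reshape to long format, count, and reshape back to wide. A rough sketch (assuming tidyr is available; the column order will differ slightly from the table above):
library(dplyr)
library(tidyr)
# count every did / variable / value combination, then spread back to wide
counts <- dd %>%
  pivot_longer(c(bc, rc, oap), names_to = "var", values_to = "val") %>%
  count(did, var, val) %>%
  complete(did, var, val = 0:2, fill = list(n = 0)) %>%  # keep values that never occur, e.g. bc_2
  pivot_wider(names_from = c(var, val), values_from = n)
dd %>%
  group_by(did) %>%
  summarize(hh_count = n_distinct(hhid)) %>%
  left_join(counts, by = "did")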
I have the following dataset
mydata=structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), ad_id = c(111L, 111L, 111L,
111L, 1111L, 1111L, 11111L, 11111L, 11111L, 111L, 111L, 1111L,
1111L, 11111L, 11111L, 11111L, 111111L, 111111L), price = c(1L,
0L, 1L, 0L, 2L, 0L, 3L, 0L, 0L, 1L, 0L, 2L, 0L, 3L, 0L, 0L, 1L,
0L), rev = c(2L, 0L, 0L, 2L, 3L, 3L, 4L, 4L, 4L, 2L, 2L, 3L,
3L, 4L, 4L, 4L, 0L, 0L), data = structure(c(1L, 2L, 2L, 3L, 1L,
3L, 1L, 3L, 4L, 1L, 3L, 1L, 3L, 1L, 3L, 4L, 1L, 3L), .Label = c("01.01.2018",
"01.02.2018", "01.03.2018", "02.03.2018"), class = "factor")), .Names = c("id",
"ad_id", "price", "rev", "data"), class = "data.frame", row.names = c(NA,
-18L))
How can I create a dummy variable according to the following logic:
For each id and ad_id, I need to aggregate price and rev over the date column (data); each ad_id has such a date column.
If, for each id and ad_id over a period of up to 90 days (the data column is in d.m.Y format), rev is greater than price, then the flag is set to 1; otherwise the flag is 0.
In this reproducible example, I just take 1 id and 4 ad_id values.
Aggregated by sum, it looks like this:
id ad_id price rev
1 1 111 2 4
2 1 1111 2 6
3 1 11111 3 12
4 1 111111 1 0
So for id = 1, all ad_id (besides ad_id = 111111) satisfy rev > price, so in the initial data
ad_id = 111, 1111, 11111 must have flag = 1 and ad_id = 111111 must have flag = 0.
Here is the desired output:
id ad_id price rev data flag
1 1 111 1 2 01.01.2018 1
2 1 111 0 0 01.02.2018 1
3 1 111 1 0 01.02.2018 1
4 1 111 0 2 01.03.2018 1
5 1 1111 2 3 01.01.2018 1
6 1 1111 0 3 01.03.2018 1
7 1 11111 3 4 01.01.2018 1
8 1 11111 0 4 01.03.2018 1
9 1 11111 0 4 02.03.2018 1
10 1 111111 1 0 01.01.2018 0
11 1 111111 0 0 01.03.2018 0
How can I implement this condition?
I am not sure if I understood you correctly, but is this what you are looking for:
library(tidyverse)
mydata %>% as_tibble() %>%
group_by(id, ad_id) %>%
summarise_at(vars("price", "rev"), sum) %>%
mutate(flag = if_else(price > rev, 0, 1)) %>%
select(id, ad_id, flag) %>%
left_join(mydata, ., by = c("id", "ad_id"))
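Note that the pipeline above ignores the 90-day restriction mentioned in the question. A rough sketch of one way to respect it, assuming the window is measured from each ad_id's first date (an assumption, since the question does not pin this down):
library(dplyr)
library(lubridate)
mydata %>%
  mutate(date = dmy(as.character(data))) %>%   # "01.01.2018" is day.month.year
  group_by(id, ad_id) %>%
  filter(date <= min(date) + days(90)) %>%     # keep only rows within 90 days of each ad_id's first date
  summarise(flag = if_else(sum(rev) > sum(price), 1, 0)) %>%
  left_join(mydata, ., by = c("id", "ad_id"))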
I have a monthly time series - monthlyTs:
monthlyTs <- ts(all.xts , frequency = 12, start=decimal_date(ymd("2012-01-29")))
head(index(monthlyTs))
1 "2012-01-29 00:00:00 UTC" "2012-02-26 01:22:47 UTC" "2012-03-25
02:45:35 UTC" "2012-04-29 04:29:04 UTC"
[5] "2012-05-27 05:51:52 UTC" "2012-06-24 07:14:39 UTC"
I want to apply a time window that starts from 2013:
head(window(monthlyTs, start = 2013))
2012-01-29 00:00:00 2
2012-02-26 01:22:47 8
2012-03-25 02:45:35 6
2012-04-29 04:29:04 5
2012-05-27 05:51:52 4
2012-06-24 07:14:39 4
So it looks like the window function is not filtering as expected. What is wrong?
Fully reproducible example as requested:
christmas.csv - a tiny CSV file (Google Trends data for the query 'Christmas')
# Reading data from the csv. Format - [week start date], [views per week]
library(lubridate)
library(xts)
data = read.csv('christmas.csv', sep = ",", header = FALSE, skip = 3, col.names = c("Week", "Views"))[[2]]
# creating the time series
myTs <- ts(data, freq = 365.25/7, start = decimal_date(ymd("2012-01-29")))
# converting from the weekly to a monthly time series
all.xts <- xts(myTs, date_decimal(index(myTs)))
monthlyTs <- ts(all.xts , frequency = 12, start=decimal_date(ymd("2012-01-29")))
head(window(monthlyTs, start = 2013))
2012-01-29 00:00:00 2
2012-02-26 01:22:47 8
2012-03-25 02:45:35 6
2012-04-29 04:29:04 5
2012-05-27 05:51:52 4
2012-06-24 07:14:39 4
There are two problems:
the object all.xts is a weekly and not a monthly time series
the value you pass for the argument start is not correct
For the second point, change the value you pass for the argument start in your call of the function ts to
c(lubridate::year("2012-01-29"), lubridate::month("2012-01-29"))
and keep the frequency at 12, i.e. use the line:
ts(all.xts , frequency = 12, start = c(lubridate::year("2012-01-29"), lubridate::month("2012-01-29")) )
Using the output from dput, your code can be rewritten as follows:
data <- c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 4L, 5L, 5L, 6L, 8L, 11L, 16L, 22L, 33L, 42L,
45L, 55L, 64L, 8L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 5L, 6L, 8L,
12L, 16L, 21L, 27L, 43L, 47L, 56L, 79L, 10L, 5L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 5L, 5L, 6L, 8L, 12L, 17L, 21L, 27L, 43L, 47L, 53L,
87L, 12L, 5L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 5L, 6L, 6L, 8L, 13L,
17L, 20L, 27L, 44L, 50L, 54L, 100L, 15L, 6L, 3L, 2L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 4L, 5L, 5L, 6L, 8L, 11L, 16L, 21L, 29L, 43L, 48L, 53L, 80L,
46L, 8L, 3L, 2L)
library(lubridate)
myTs <- ts(data, freq = 365.25/7, start = decimal_date(ymd("2012-01-29")))
all.xts <- xts::xts(myTs, date_decimal(index(myTs)))
monthlyTs <- ts(all.xts , frequency = 12, start = c(lubridate::year("2012-01-29"), lubridate::month("2012-01-29")) )
window(monthlyTs, start= c(2013))
The last line will print:
> window(monthlyTs, start= c(2013))
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
2013 1 1 1 1 1 1 1 1 1 1 1 1
2014 1 1 1 1 2 2 2 2 3 3 3 4
2015 5 5 6 8 11 16 22 33 42 45 55 64
2016 8 4 2 2 2 2 2 2 1 1 1 1
2017 1 1 1 1 1 1 1 1 1 1 1 1
2018 1 1 1 1 1 1 1 2 2 2 2 2
2019 3 3 3 4 4 5 6 8 12 16 21 27
2020 43 47 56 79 10 5 2 2 2 1 1 1
2021 1 1 1 1 1 1 1 1 1 1 1 1
2022 1 1 1 1 1 1 1 1 1 1 2 2
2023 2 2 2 2 3 3 3 4 5 5 6 8
2024 12 17 21 27 43 47 53 87 12 5 2 2
2025 2 1 1 1 1 1 1 1 1 1 1 1
2026 1 1 1 1 1 1 1 1 1 1 1 1
2027 1 2 2 2 2 2 2 2 3 3 3 4
2028 5 6 6 8 13 17 20 27 44 50 54 100
2029 15 6 3 2 2 1 1 1 1 1 1 1
2030 1 1 1 1 1 1 1 1 1 1 1 1
2031 1 1 1 1 1 1 2 2 2 2 2 2
2032 3 3 3 4 5 5 6 8 11 16 21 29
2033 43 48 53 80 46 8 3 2
With the following dataframe I need to obtain monthly sums of the following two variables: "CallsHandled" and "Engaged"
By the following grouping variables: "Month","ID","Location","LANGUAGE","MemRegion"
structure(list(Month = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Week = c(1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L), ID = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A1234",
"F1234"), class = "factor"), Location = structure(c(2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L), .Label = c("Corona", "Denver"), class = "factor"), LANGUAGE = structure(c(1L,
2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L,
2L, 2L, 1L), .Label = c("English", "Spanish"), class = "factor"),
MemRegion = structure(c(1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NCAL",
"SCAL"), class = "factor"), CallsHandled = c(1L, 1L, 8L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 10L, 1L, 3L, 1L, 8L, 1L, 6L,
1L, 1L, 2L), Engaged = c(120L, 30L, 1243L, 75L, 45L, 55L,
200L, 120L, 30L, 230L, 2065L, 45L, 55L, 200L, 1483L, 30L,
1243L, 75L, 45L, 55L), QueueA = c(0L, 0L, 0L, 1L, 1L, 0L,
0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L),
QueueB = c(1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L), QueueC = c(0L, 1L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L,
0L)), .Names = c("Month", "Week", "ID", "Location", "LANGUAGE",
"MemRegion", "CallsHandled", "Engaged", "QueueA", "QueueB", "QueueC"
), class = "data.frame", row.names = c(NA, -20L))
Additionally, in order to include "Queues A:C" as grouping variables, would I have to combine them into a single column? If so, how?
So there are two parts to this question: first, how do you group things up and sum, and second, how do you combine QueueA:QueueC into one column.
For the first part you can use the dplyr library, which makes it a lot easier and more intuitive.
df <- structure(list(Month = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
Week = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L),
ID = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A1234", "F1234"), class = "factor"),
Location = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L), .Label = c("Corona", "Denver"), class = "factor"),
LANGUAGE = structure(c(1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L), .Label = c("English", "Spanish"), class = "factor"),
MemRegion = structure(c(1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NCAL", "SCAL"), class = "factor"),
CallsHandled = c(1L, 1L, 8L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 10L, 1L, 3L, 1L, 8L, 1L, 6L, 1L, 1L, 2L),
Engaged = c(120L, 30L, 1243L, 75L, 45L, 55L, 200L, 120L, 30L, 230L, 2065L, 45L, 55L, 200L, 1483L, 30L, 1243L, 75L, 45L, 55L),
QueueA = c(0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L),
QueueB = c(1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L),
QueueC = c(0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L)), .Names = c("Month", "Week", "ID", "Location", "LANGUAGE", "MemRegion", "CallsHandled", "Engaged", "QueueA", "QueueB", "QueueC"),
class = "data.frame", row.names = c(NA, -20L))
library(dplyr)
df %>% group_by(Month, ID, Location, LANGUAGE, MemRegion) %>%
mutate(TotalCallsHandled = sum(CallsHandled),
TotalEngaged = sum(Engaged))
So firstly, we use group_by to group by exactly the variables you have listed, and mutate to sum everything up within those groups; this will do what you intuitively expect.
For combining everything into one column, there are probably many ways to do this, but the most straightforward is to create some kind of unique identifier for each queue and combine all the columns into one.
df$Queue <- as.factor(df$QueueA + df$QueueB*2 + df$QueueC*3)
levels(df$Queue) <- c("A", "B", "C")
Since each queue column is a 0/1 flag, the weighted sum recodes each row as 1 -> A, 2 -> B, 3 -> C, and we then relabel the factor levels as A, B, C. Then we can simply use group_by again to get the intended result as above.
df %>% group_by(Month, ID, Location, LANGUAGE, MemRegion, Queue) %>%
mutate(TotalCallsHandled = sum(CallsHandled),
TotalEngaged = sum(Engaged)) %>%
select(-QueueA, -QueueB, -QueueC)
With output:
Source: local data frame [20 x 11]
Groups: Month, ID, Location, LANGUAGE, MemRegion, Queue
Month Week ID Location LANGUAGE MemRegion CallsHandled Engaged Queue
1 1 1 F1234 Denver English NCAL 1 120 B
2 1 2 F1234 Corona Spanish SCAL 1 30 C
3 1 3 F1234 Corona English NCAL 8 1243 B
4 1 4 F1234 Corona Spanish NCAL 1 75 A
5 1 5 F1234 Corona Spanish SCAL 1 45 A
6 1 6 F1234 Denver English SCAL 2 55 B
7 1 7 F1234 Corona English NCAL 1 200 C
8 1 8 F1234 Corona English NCAL 1 120 B
9 1 9 F1234 Denver English NCAL 1 30 A
10 1 10 F1234 Corona Spanish NCAL 1 230 C
11 1 1 A1234 Corona English NCAL 10 2065 C
12 1 2 A1234 Corona English SCAL 1 45 A
13 1 3 A1234 Corona Spanish NCAL 3 55 A
14 1 4 A1234 Corona English NCAL 1 200 A
15 1 5 A1234 Corona English SCAL 8 1483 B
16 1 6 A1234 Denver English SCAL 1 30 B
17 1 7 A1234 Corona Spanish SCAL 6 1243 C
18 1 8 A1234 Corona Spanish SCAL 1 75 B
19 1 9 A1234 Corona Spanish SCAL 1 45 C
20 1 10 A1234 Corona English SCAL 2 55 B
Variables not shown: TotalCallsHandled (int), TotalEngaged (int)
To make the Queue variables into a single factor variable, you could do this:
queues <- which(dat[, c("QueueA", "QueueB", "QueueC")] == 1, arr.ind = TRUE)
queues <- queues[order(queues[, "row"]), "col"]
queues <- factor(queues, labels = c("QueueA", "QueueB", "QueueC"))
dat <- data.frame(dat, queues)
Though, @chappers' approach for this is nicer.
Then, you can use aggregate:
aggregate(dat[,c("CallsHandled", "Engaged")],
by=list(dat$Month, dat$ID, dat$Location, dat$LANGUAGE, dat$MemRegion, dat$queues),
sum)
# Group.1 Group.2 Group.3 Group.4 Group.5 Group.6 CallsHandled Engaged
#1 1 A1234 Corona English NCAL QueueA 1 200
#2 1 F1234 Denver English NCAL QueueA 1 30
#3 1 A1234 Corona Spanish NCAL QueueA 3 55
#4 1 F1234 Corona Spanish NCAL QueueA 1 75
#5 1 A1234 Corona English SCAL QueueA 1 45
#6 1 F1234 Corona Spanish SCAL QueueA 1 45
#7 1 F1234 Corona English NCAL QueueB 9 1363
#8 1 F1234 Denver English NCAL QueueB 1 120
#9 1 A1234 Corona English SCAL QueueB 10 1538
#10 1 A1234 Denver English SCAL QueueB 1 30
#11 1 F1234 Denver English SCAL QueueB 2 55
#12 1 A1234 Corona Spanish SCAL QueueB 1 75
#13 1 A1234 Corona English NCAL QueueC 10 2065
#14 1 F1234 Corona English NCAL QueueC 1 200
#15 1 F1234 Corona Spanish NCAL QueueC 1 230
#16 1 A1234 Corona Spanish SCAL QueueC 7 1288
#17 1 F1234 Corona Spanish SCAL QueueC 1 30
@chappers' solution aggregates correctly but leaves me with a bunch of duplicate rows for some reason that I can't figure out. This works for factors and reduces the number of rows in my actual dataframe (no duplicates):
aggregate(cbind(CallsHandled, Engaged) ~ Month + ID + Location + LANGUAGE + MemRegion, data = df, sum, na.rm = TRUE)
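For reference, the duplicate rows from the earlier approach likely come from mutate(), which keeps every original row so the group totals repeat; summarise() collapses to one row per group. A sketch of that variant, assuming the df and Queue column built above:
library(dplyr)
# one row per group instead of one per original row
df %>%
  group_by(Month, ID, Location, LANGUAGE, MemRegion, Queue) %>%
  summarise(TotalCallsHandled = sum(CallsHandled),
            TotalEngaged = sum(Engaged))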
I've got a simple dataset.
structure(list(ID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 4L, 4L,
4L, 5L, 5L), Primrely = c(0L, 2L, 1L, 1L, 1L, 1L, 3L, 4L, 4L,
3L, 1L, 2L, 2L), Primset = c(-4L, -3L, 1L, 2L, -4L, 5L, 3L, 1L,
2L, -4L, -2L, -3L, 3L), Primvalue = c(45L, 5L, 6L, 15L, 53L,
45L, 44L, 65L, 1L, 5L, 1L, 12L, 5L), Secrely = c(5L, 7L, 2L,
1L, 2L, 0L, 4L, 5L, 1L, 1L, 1L, 0L, 2L), Secset = c(-3L, 1L,
2L, -2L, -3L, 2L, 5L, 7L, 7L, 4L, 3L, 2L, 1L), Secvalue = c(38L,
-2L, -1L, 8L, 46L, 38L, 37L, 58L, -6L, -2L, -6L, 5L, -2L), Desired = structure(c(NA,
1L, NA, NA, 2L, 2L, NA, NA, NA, NA, NA, 1L, 1L), .Label = c("Primary",
"Secondary"), class = "factor")), .Names = c("ID", "Primrely",
"Primset", "Primvalue", "Secrely", "Secset", "Secvalue", "Desired"
), class = "data.frame", row.names = c(NA, -13L))
ID Primrely Primset Primvalue Secrely Secset Secvalue Desired
1 1 0 -4 45 5 -3 38 <NA>
2 1 2 -3 5 7 1 -2 Primary
3 1 1 1 6 2 2 -1 <NA>
4 1 1 2 15 1 -2 8 <NA>
5 2 1 -4 53 2 -3 46 Secondary
6 2 1 5 45 0 2 38 Secondary
7 2 3 3 44 4 5 37 <NA>
8 3 4 1 65 5 7 58 <NA>
9 4 4 2 1 1 7 -6 <NA>
10 4 3 -4 5 1 4 -2 <NA>
11 4 1 -2 1 1 3 -6 <NA>
12 5 2 -3 12 0 2 5 Primary
13 5 2 3 5 2 1 -2 Primary
For each ID, I'd like to select rows that meet the criteria (Prim = primary, Sec = secondary): if Primrely is 0 or 2 and Primset is in -3:3, select those rows for each ID. If no rows for a given ID meet the primary criteria, select rows that meet the secondary criteria (Secrely is 0 or 2 and Secset is in -3:3). Ideally, I'd like to add a column (Desired) that indicates which criteria were met (primary/secondary/NA).
I've been working with ifelse and if/else functions without much luck, mainly because I don't know how to tell R to ignore the secondary criteria for a given ID if the primary criteria were already met (e.g. ID #1 meets the secondary criteria but doesn't need them because it already met the primary criteria). In other words, if a 'primary' shows up for a given ID, it trumps all the 'secondary' criteria that were met. I would appreciate any advice.
If I understand you correctly now:
(I've left in the intermediate steps to show what I was doing; you can remove them and/or do this all in one step if you want.) Desired_step1 codes each row as 1 = primary, 2 = secondary, 3 = neither; the ave call then keeps a row's code only where it equals the minimum code within that ID, so a primary match anywhere in an ID trumps its secondary matches.
dat <- structure(list(ID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 4L, 4L,
4L, 5L, 5L), Primrely = c(0L, 2L, 1L, 1L, 1L, 1L, 3L, 4L, 4L,
3L, 1L, 2L, 2L), Primset = c(-4L, -3L, 1L, 2L, -4L, 5L, 3L, 1L,
2L, -4L, -2L, -3L, 3L), Primvalue = c(45L, 5L, 6L, 15L, 53L,
45L, 44L, 65L, 1L, 5L, 1L, 12L, 5L), Secrely = c(5L, 7L, 2L,
1L, 2L, 0L, 4L, 5L, 1L, 1L, 1L, 0L, 2L), Secset = c(-3L, 1L,
2L, -2L, -3L, 2L, 5L, 7L, 7L, 4L, 3L, 2L, 1L), Secvalue = c(38L,
-2L, -1L, 8L, 46L, 38L, 37L, 58L, -6L, -2L, -6L, 5L, -2L), Desired = structure(c(NA,
1L, NA, NA, 2L, 2L, NA, NA, NA, NA, NA, 1L, 1L), .Label = c("Primary",
"Secondary"), class = "factor")), .Names = c("ID", "Primrely",
"Primset", "Primvalue", "Secrely", "Secset", "Secvalue", "Desired"
), class = "data.frame", row.names = c(NA, -13L))
within(dat, {
Desired_step1 <- ifelse(Primrely %in% c(0,2) & Primset %in% -3:3,
1, ifelse(Secrely %in% c(0,2) & Secset %in% -3:3,
2, 3))
Desired_new <- factor(ave(Desired_step1, ID, FUN = function(x)
ifelse(x == min(x), x, NA)),
levels = 1:3, labels = c('Primary', 'Secondary', 'NA'))
Desired_step1 <- c('1'='Primary','2'='Secondary','3'=NA)[Desired_step1]
})
# ID Primrely Primset Primvalue Secrely Secset Secvalue Desired Desired_new Desired_step1
# 1 1 0 -4 45 5 -3 38 <NA> <NA> <NA>
# 2 1 2 -3 5 7 1 -2 Primary Primary Primary
# 3 1 1 1 6 2 2 -1 <NA> <NA> Secondary
# 4 1 1 2 15 1 -2 8 <NA> <NA> <NA>
# 5 2 1 -4 53 2 -3 46 Secondary Secondary Secondary
# 6 2 1 5 45 0 2 38 Secondary Secondary Secondary
# 7 2 3 3 44 4 5 37 <NA> <NA> <NA>
# 8 3 4 1 65 5 7 58 <NA> NA <NA>
# 9 4 4 2 1 1 7 -6 <NA> NA <NA>
# 10 4 3 -4 5 1 4 -2 <NA> NA <NA>
# 11 4 1 -2 1 1 3 -6 <NA> NA <NA>
# 12 5 2 -3 12 0 2 5 Primary Primary Primary
# 13 5 2 3 5 2 1 -2 Primary Primary Primary
Here's my quick & dirty solution assuming your data.frame is named df. You can refine it yourself I think:
df$Desired <- ifelse((df$Primrely==0 | df$Primrely==2) & (df$Primset >= -3 & df$Primset <= 3),
"Primary",
NA)
idx <- is.na(df$Desired)
df$Desired[idx] <- ifelse((df$Secrely[idx]==0 | df$Secrely[idx]==2) & (df$Secset[idx] >= -3 & df$Secset[idx] <= 3),
"Secondary",
NA)
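Note that this row-wise version can still mark a row as "Secondary" inside an ID that already has a "Primary" row (e.g. row 3 of ID 1), which the question says should be trumped. A small follow-up sketch, assuming df$Desired as built above:
# IDs that already contain at least one "Primary" row
has_primary <- df$ID %in% df$ID[df$Desired %in% "Primary"]
# within those IDs, drop any row-wise "Secondary" labels
df$Desired[has_primary & df$Desired %in% "Secondary"] <- NA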