Counting 0s 1s and 2s in multiple column in r - r

My data looks like this:
# Example data from the question: a 20-row data.frame with columns did, hhid,
# bc, rc and oap. In this sample bc and rc hold 0/1 and oap holds 0/2.
# The literal is not assigned to a name here.
structure(list(did = c(209L, 209L, 206L, 206L, 206L, 206L, 206L,
206L, 206L, 206L, 206L, 209L, 206L, 206L, 207L, 207L, 207L, 207L,
209L, 209L), hhid = c(5668, 5595, 4724, 4756, 4856, 4730, 4757,
6320, 4758, 6319, 6311, 5477, 6322, 6317, 134, 178, 238, 179,
5865, 5875), bc = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L,
1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L), rc = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
oap = c(2L, 2L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 2L, 2L, 0L, 2L,
2L, 2L, 2L, 2L, 2L, 0L, 0L)), row.names = c(NA, 20L), class = "data.frame")
hhid is unique for each row. The remaining columns consist of 0s and 1s in some columns and 0s, 1s and 2s in others.
The output column required is like this:
did hh_count bc_0 bc_1 bc_2 rc_0 rc_1 rc_2 oap_0 oap_1 oap_2
where did will be unique and hh_count will be the count of hhid values associated with each did.
bc_0, bc_1 and bc_2 will be the breakdown of column bc, i.e. the counts of 0s, 1s and 2s in bc. Similarly for rc_0, rc_1 and rc_2 and for oap_0, oap_1 and oap_2. So a count of 0s, 1s and 2s is required for each column.

With counts of 3 specific values, writing the functions manually seems reasonable. If you need specific counts of more distinct values we could come up with a better way to generalize - probably converting your data to long format, summarizing, and then going back to wide.
library(dplyr) # across() requires dplyr version 1.0 or higher

# For every did: the number of distinct households, plus how often each of
# the codes 0, 1 and 2 occurs in bc, rc and oap. The names of the function
# list ("0", "1", "2") become the suffixes of the output columns.
dd %>% # (calling your data dd)
  group_by(did) %>%
  summarise(
    hh_count = n_distinct(hhid),
    across(
      c(bc, rc, oap),
      .fns = list(
        "0" = function(x) sum(x == 0),
        "1" = function(x) sum(x == 1),
        "2" = function(x) sum(x == 2)
      ),
      .names = "{.col}_{.fn}" # this is the default, but I show it explicitly
    )
  )
# # A tibble: 3 x 11
# did hh_count bc_0 bc_1 bc_2 rc_0 rc_1 rc_2 oap_0 oap_1 oap_2
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1 206 11 2 9 0 2 9 0 6 0 5
# 2 207 4 0 4 0 0 4 0 0 0 4
# 3 209 5 2 3 0 0 5 0 3 0 2

Related

Create a new data frame using values from existing data

I have a data frame that looks like this:
I need to make a new data frame that takes the values for density from the expr_phenotype column and puts them in the formula: ((density for 4 + density for 68)/(density for 0+4+64+68)*100). How should I go about this?
I know it's easier to make a reproducible example but this is a funky dataset.
We can use dplyr::group_by, then use summarise:
library(dplyr)

# Per (core_x, core_y): density summed over phenotypes 4 and 68, as a
# percentage of density summed over phenotypes 0, 4, 64 and 68.
data %>%
  group_by(core_x, core_y) %>%
  summarise(
    result = {
      selected <- density[expr_phenotype %in% c(4, 68)]
      reference <- density[expr_phenotype %in% c(0, 4, 64, 68)]
      sum(selected) / sum(reference) * 100
    }
  )
## A tibble: 3 x 3
## Groups: core_x [1]
# core_x core_y result
# <int> <int> <dbl>
#1 1 1 39.8
#2 1 2 39.1
#3 1 3 51.8
Data (Obtained by OCR, please forgive errors)
# Sample data (10 rows, row names 3 to 12) transcribed from the question via
# OCR, so individual values may contain recognition errors.
# core_x / core_y identify a core; density is given per expr_phenotype code.
data <- structure(list(core_x = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), core_y = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L), core = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L), .Label = c("1--1", "1--2",
"1--3"), class = "factor"), xX = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "Cc", class = "factor"), phenotype = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "CD163", class = "factor"),
expr_phenotype = c(0L, 4L, 64L, 68L, 0L, 4L, 64L, 68L, 0L,
4L), count = c(510L, 334L, 1L, 4L, 186L, 116L, 1L, 3L, 196L,
210L), density = c(451L, 295L, 1L, 4L, 164L, 103L, 1L, 3L,
173L, 186L)), row.names = 3:12, class = "data.frame")

aggregation of data in R with assign of a dummy variable by condition

I have the following dataset
# Example data: 18 rows, all with id = 1, spread over four ad_id values.
# `data` is a factor of date strings (presumably day.month.year, per the
# question text -- confirm before parsing).
mydata=structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), ad_id = c(111L, 111L, 111L,
111L, 1111L, 1111L, 11111L, 11111L, 11111L, 111L, 111L, 1111L,
1111L, 11111L, 11111L, 11111L, 111111L, 111111L), price = c(1L,
0L, 1L, 0L, 2L, 0L, 3L, 0L, 0L, 1L, 0L, 2L, 0L, 3L, 0L, 0L, 1L,
0L), rev = c(2L, 0L, 0L, 2L, 3L, 3L, 4L, 4L, 4L, 2L, 2L, 3L,
3L, 4L, 4L, 4L, 0L, 0L), data = structure(c(1L, 2L, 2L, 3L, 1L,
3L, 1L, 3L, 4L, 1L, 3L, 1L, 3L, 1L, 3L, 4L, 1L, 3L), .Label = c("01.01.2018",
"01.02.2018", "01.03.2018", "02.03.2018"), class = "factor")), .Names = c("id",
"ad_id", "price", "rev", "data"), class = "data.frame", row.names = c(NA,
-18L))
How can I create a dummy variable according to the following logic:
For each id and ad_id I need to aggregate by data price and rev. Each ad_id has a date column (data).
If, for each id and ad_id, over the period of up to 90 days (data column, d-m-y), rev is greater than price, then the flag is set to 1; otherwise the flag is 0.
In this reproducible example , I just take 1 id and 4 ad_id.
In aggregated by sum form it is view
id ad_id price rev
1 1 111 2 4
2 1 1111 2 6
3 1 11111 3 12
4 1 111111 1 0
So for id=1, all ad_id (besides ad_id = 111111) satisfy rev > price, so in the initial data
ad_id = 111, 1111, 11111 must have flag = 1 and 111111 must have flag = 0.
Here is the desired output:
id ad_id price rev data flag
1 1 111 1 2 01.01.2018 1
2 1 111 0 0 01.02.2018 1
3 1 111 1 0 01.02.2018 1
4 1 111 0 2 01.03.2018 1
5 1 1111 2 3 01.01.2018 1
6 1 1111 0 3 01.03.2018 1
7 1 11111 3 4 01.01.2018 1
8 1 11111 0 4 01.03.2018 1
9 1 11111 0 4 02.03.2018 1
10 1 111111 1 0 01.01.2018 0
11 1 111111 0 0 01.03.2018 0
How to perform such condition
I am not sure if understood you correctly, but is this what you are looking for:
library(tidyverse)
# Flag each (id, ad_id) pair and copy the flag onto every original row:
# flag = 1 when the pair's total rev is strictly greater than its total price,
# 0 otherwise (so ties get 0, as the question requires).
# Totals are taken over all dates; no 90-day window is applied here.
mydata %>%
  as_tibble() %>%
  group_by(id, ad_id) %>%
  summarise(across(c(price, rev), sum), .groups = "drop") %>%
  mutate(flag = if_else(rev > price, 1, 0)) %>%
  select(id, ad_id, flag) %>%
  left_join(mydata, ., by = c("id", "ad_id"))

Subset data in r that includes group_by function

This is a follow up question to the following problem give here
I have the following data
Data:
# Example data: 9 rows across three Org_ID values. The columns are named
# directly in the list; the original relied on a trailing `.Names` attribute
# that was missing its separating comma and therefore did not parse.
df <- structure(
  list(
    Org_ID = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L),
    Market_volume = c(100L, 200L, 300L, 50L, 500L, 400L, 200L, 300L, 100L),
    Indicator_variable = c(1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L),
    Var3 = c(10L, 1L, 1L, 4L, 2L, 3L, 3L, 10L, 3L),
    Var4 = c(2L, 1L, 1L, 7L, 2L, 3L, 3L, 8L, 3L)
  ),
  class = "data.frame",
  row.names = c(NA, -9L)
)
Using (dplyr), i calculated the % of NA's by market volume by Org_ID via the following function
# Per Org_ID: the percentage of total Market_volume that sits on rows where
# Indicator_variable is 0.
df %>%
  group_by(Org_ID) %>%
  summarise(
    Perc_Market_Vol = 100 * sum(Market_volume[!Indicator_variable]) /
      sum(Market_volume)
  )
Result:
# A tibble: 3 x 2
Org_ID Perc_Market_Vol
<int> <dbl>
1 1 83.33333
2 2 0.00000
3 3 100.00000
Question:
I want to subset my original data by deleting all rows of an Org_ID (say Org_ID 2) if its Perc_Market_Vol is below 30. That is, I do not want to delete individual rows of an Org_ID, but the Org_ID as a whole (e.g. every row with Org_ID = 1 or Org_ID = 2). How can I subset the data by linking the two tables or functions?
I want the new data look like this:
# Desired result: the example data with every Org_ID = 2 row removed (7 rows).
# The columns are named directly in the list; the original relied on a
# trailing `.Names` attribute that was missing its separating comma and
# therefore did not parse.
df1 <- structure(
  list(
    Org_ID = c(1L, 1L, 1L, 3L, 3L, 3L, 3L),
    Market_volume = c(100L, 200L, 300L, 400L, 200L, 300L, 100L),
    Indicator_variable = c(1L, 0L, 0L, 0L, 0L, 0L, 0L),
    Var3 = c(10L, 1L, 1L, 3L, 3L, 10L, 3L),
    Var4 = c(2L, 1L, 1L, 3L, 3L, 8L, 3L)
  ),
  class = "data.frame",
  row.names = c(NA, -7L)
)
You can filter without materializing the aggregated data frame by using group_by %>% filter, and in the filter you can calculate the aggregated condition per group:
# Keep all rows of an Org_ID when the share of its Market_volume on rows with
# Indicator_variable == 0 is at least 30%; an Org_ID below 30% is dropped as
# a whole. `>=` keeps a group sitting exactly on the 30% threshold, matching
# the "delete if below 30" rule.
df %>%
  group_by(Org_ID) %>%
  filter(
    100 * sum(Market_volume * !Indicator_variable) / sum(Market_volume) >= 30
  )
# A tibble: 7 x 5
# Groups: Org_ID [2]
# Org_ID Market_volume Indicator_variable Var3 Var4
# <int> <int> <int> <int> <int>
#1 1 100 1 10 2
#2 1 200 0 1 1
#3 1 300 0 1 1
#4 3 400 0 3 3
#5 3 200 0 3 3
#6 3 300 0 10 8
#7 3 100 0 3 3

Calculate the mean based on the previous consecutive trials

I would like to calculate the mean of every SACCADIC_RT for which COMMISSION_ERROR =1, that follows every 5 consecutive HITS=1, per ID per condition.
ID | TRIAL | TRIAL_TYPE| CONDITION | COMMISSION_ERROR | HITS| SACCADIC_RT
1 183 nogo square_1 1 -1 175
1 54 go square_1 -1 1 259
1 26 nogo square_1 1 -1 365
1 3 nogo square_1 1 -1 346
1 100 nogo square_1 1 -1 287
1 11 go square_1 -1 1 164
1 52 go square_1 -1 1 244
1 8 go square_1 -1 1 223
1 10 go square_1 -1 1 183
1 21 go square_1 -1 1 234
1 32 go square_1 1 -1 221
1 2 go square_1 -1 1 183
1 13 nogo square_1 0 -1 -1
1 87 nogo square_2 1 -1 228
1 95 nogo square_2 1 -1 274
1 111 go square_2 -1 1 305
1 28 nogo square_2 0 -1 309
1 65 go square_2 -1 0 -1
1 40 nogo square_1 0 -1 199
1 19 nogo square_1 0 -1 207
1 28 go square_1 -1 1 257
2 45 nogo square_1 1 -1 169
2 197 nogo square_1 1 -1 350
2 115 nogo square_1 1 -1 321
2 65 go square_2 -1 1 298
2 24 go square_2 -1 0 -1
2 1 nogo square_2 1 -1 183
2 77 go square_2 -1 1 225
2 90 go square_2 -1 1 305
2 89 go square_2 -1 1 210
2 104 go square_2 -1 1 199
2 116 go square_2 -1 1 175
2 29 nogo square_2 1 -1 99
2 41 go square_2 -1 1 104
The sample table can be recreated in r as:
# dput() of the sample table above: 34 trials for ID 1 and ID 2.
# Note the column names as dumped (X..TRIAL.., TRIAL_TYPE.,
# X..COMMISSION_ERROR, X..HITS.) differ from the headers shown in the table.
# The literal is not assigned to a name here.
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), X..TRIAL.. = c(183L, 54L, 26L,
3L, 100L, 11L, 52L, 8L, 10L, 21L, 32L, 2L, 13L, 87L, 95L, 111L,
28L, 65L, 40L, 19L, 28L, 45L, 197L, 115L, 65L, 24L, 1L, 77L,
90L, 89L, 104L, 116L, 29L, 41L), TRIAL_TYPE. = structure(c(2L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
1L), .Label = c("go", "nogo"), class = "factor"), CONDITION = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("square_1", "square_2"), class = "factor"), X..COMMISSION_ERROR = c(1L,
-1L, 1L, 1L, 1L, -1L, -1L, -1L, -1L, -1L, 1L, -1L, 0L, 1L, 1L,
-1L, 0L, -1L, 0L, 0L, -1L, 1L, 1L, 1L, -1L, -1L, 1L, -1L, -1L,
-1L, -1L, -1L, 1L, -1L), X..HITS. = c(-1L, 1L, -1L, -1L, -1L,
1L, 1L, 1L, 1L, 1L, -1L, 1L, -1L, -1L, -1L, 1L, -1L, 0L, -1L,
-1L, 1L, -1L, -1L, -1L, 1L, 0L, -1L, 1L, 1L, 1L, 1L, 1L, -1L,
1L), SACCADIC_RT = c(175L, 259L, 365L, 346L, 287L, 164L, 244L,
223L, 183L, 234L, 221L, 183L, -1L, 228L, 274L, 305L, 309L, -1L,
199L, 207L, 257L, 169L, 350L, 321L, 298L, -1L, 183L, 225L, 305L,
210L, 199L, 175L, 99L, 104L)), .Names = c("ID", "X..TRIAL..",
"TRIAL_TYPE.", "CONDITION", "X..COMMISSION_ERROR", "X..HITS.",
"SACCADIC_RT"), class = "data.frame", row.names = c(NA, -34L))
So the result from this example will be like:
ID | CONDITION | x
1 square_1 221
2 square_2 99
You can use the package data.table to perform this task.
The steps are then as follows:
1) for each ID and condition calculate the rolling sum of hits
2) take only rows which satisfy 2 conditions: commission_error = 1 on the current row, and the rolling sum column equals 5 on the previous row
3) calculate mean for each ID and condition in the table created in step 2
# load your data
data <- read.csv("./yourData.csv")
# load data table library
library(data.table)
# convert your data to data.table object
data <- data.table(data)
# group data by ID and Condition, calculate rolling sum over 5 rows
# (the current row plus the 4 rows before it; NA until a group has 5 rows)
data[, roll := Reduce('+', shift(HITS, 0:4)), by = list(ID, CONDITION)]
# take only rows where the previous row's rolling sum is 5 and commission
# error is 1. The filter runs on .SD inside `by`, so shift() looks at the
# previous row of the same ID and CONDITION; used directly in `i`, shift()
# is not grouped and would read the last row of the preceding group.
newData <- data[, .SD[shift(roll, 1) == 5 & COMMISSION_ERROR == 1],
                by = list(ID, CONDITION)]
# calculate mean of SACCADIC_RT for each ID and Condition in the new dataset
newData[, meanSacc := mean(SACCADIC_RT), by = list(ID, CONDITION)]
Code Explained:
roll := Reduce('+', shift(HITS, 0:4))
The shift function allows you to calculate a value in the current row based on the value in some previous row. Here 'Reduce('+', shift(HITS, 0:4))' takes the value of hits on a given row and adds to it the values of hits on the 4 previous rows. This value is then written to the new column called roll.
newData <- data[shift(roll, 1) == 5 & COMMISSION_ERROR == 1]
The above code keeps only rows from the original dataset, where there is value 5 in the previous row of the column roll and the value of COMISSION_ERROR on the current line is equal to 1.
newData[, meanSacc := mean(SACCADIC_RT), by = list(ID, CONDITION)]
The above snip calculate mean of SACCADIC_RT for each ID and CONDITION and the mean is calculated only from rows in the new dataset created above. The means are then written to a new value called meanSacc

R: Obtain aggregations of two or more variables by two or more grouping variables

With the following dataframe I need to obtain monthly sums of the following two variables: "CallsHandled" and "Engaged"
By the following grouping variables: "Month","ID","Location","LANGUAGE","MemRegion"
# Example data from the question: 20 rows (10 weeks each for IDs F1234 and
# A1234, all in Month 1) with CallsHandled, Engaged and three 0/1 queue flags.
# The literal is not assigned to a name here.
structure(list(Month = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Week = c(1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L), ID = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A1234",
"F1234"), class = "factor"), Location = structure(c(2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L), .Label = c("Corona", "Denver"), class = "factor"), LANGUAGE = structure(c(1L,
2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L,
2L, 2L, 1L), .Label = c("English", "Spanish"), class = "factor"),
MemRegion = structure(c(1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NCAL",
"SCAL"), class = "factor"), CallsHandled = c(1L, 1L, 8L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 10L, 1L, 3L, 1L, 8L, 1L, 6L,
1L, 1L, 2L), Engaged = c(120L, 30L, 1243L, 75L, 45L, 55L,
200L, 120L, 30L, 230L, 2065L, 45L, 55L, 200L, 1483L, 30L,
1243L, 75L, 45L, 55L), QueueA = c(0L, 0L, 0L, 1L, 1L, 0L,
0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L),
QueueB = c(1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L), QueueC = c(0L, 1L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L,
0L)), .Names = c("Month", "Week", "ID", "Location", "LANGUAGE",
"MemRegion", "CallsHandled", "Engaged", "QueueA", "QueueB", "QueueC"
), class = "data.frame", row.names = c(NA, -20L))
Additionally, in order to include "Queues A:C" as grouping variables, would I have to combine them into a single column? If so, how?
So there are 2 parts to this question, firstly how do you group things up and sum, and secondly how could you combine Queue A:C into one column.
For the first question you can use the library dplyr which makes it a lot easier and more intuitive.
# The question's example data assigned to `df`: 20 rows, one per ID and Week,
# with the grouping columns, the two measures (CallsHandled, Engaged) and the
# three 0/1 queue flags (QueueA, QueueB, QueueC).
df <- structure(list(Month = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
Week = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L),
ID = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A1234", "F1234"), class = "factor"),
Location = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L), .Label = c("Corona", "Denver"), class = "factor"),
LANGUAGE = structure(c(1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L), .Label = c("English", "Spanish"), class = "factor"),
MemRegion = structure(c(1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("NCAL", "SCAL"), class = "factor"),
CallsHandled = c(1L, 1L, 8L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 10L, 1L, 3L, 1L, 8L, 1L, 6L, 1L, 1L, 2L),
Engaged = c(120L, 30L, 1243L, 75L, 45L, 55L, 200L, 120L, 30L, 230L, 2065L, 45L, 55L, 200L, 1483L, 30L, 1243L, 75L, 45L, 55L),
QueueA = c(0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L),
QueueB = c(1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L),
QueueC = c(0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L)), .Names = c("Month", "Week", "ID", "Location", "LANGUAGE", "MemRegion", "CallsHandled", "Engaged", "QueueA", "QueueB", "QueueC"),
class = "data.frame", row.names = c(NA, -20L))
library(dplyr)
# Add the per-group sums of CallsHandled and Engaged as new columns.
# mutate() keeps one output row per input row, repeating the group total.
# MemRegion is part of the grouping so that it covers every requested
# variable: Month, ID, Location, LANGUAGE and MemRegion.
df %>%
  group_by(Month, ID, Location, LANGUAGE, MemRegion) %>%
  mutate(
    TotalCallsHandled = sum(CallsHandled),
    TotalEngaged = sum(Engaged)
  )
So firstly, we use group_by to group by specifically the variables you have listed, and mutate to sum everything up, and this will do what you think intuitively.
For combining everything into one column, there are probably many ways to do this, but probably the most straight forward way is to create some kind of unique identifier for each column and combine all the columns into one.
# Collapse the three 0/1 queue flags into one factor via the code
# QueueA + 2 * QueueB + 3 * QueueC (1 = A, 2 = B, 3 = C).
# Fixing levels = 1:3 ties each label to its code even when a queue never
# occurs in the data; relabelling with `levels<-` assigns labels by position
# and would mislabel in that case. Codes outside 1:3 (e.g. no flag set)
# become NA. This still assumes one flag per row: A and B together also
# sum to 3.
df$Queue <- factor(
  df$QueueA + df$QueueB * 2 + df$QueueC * 3,
  levels = 1:3,
  labels = c("A", "B", "C")
)
Since everything should be a 0, 1 flag, we can recreate the flags to be 1 -> A, 2 -> B, 3 -> C, and then relevel the factors to be A, B, C again. Then we can simply use group_by function again to get the intended result as above.
# Drop the three flag columns (Queue replaces them), then add the per-group
# sums of CallsHandled and Engaged to every row of the group.
df %>%
  select(-c(QueueA, QueueB, QueueC)) %>%
  group_by(Month, ID, Location, LANGUAGE, Queue) %>%
  mutate(
    TotalCallsHandled = sum(CallsHandled),
    TotalEngaged = sum(Engaged)
  )
With output:
Source: local data frame [20 x 11]
Groups: Month, ID, Location, LANGUAGE, Queue
Month Week ID Location LANGUAGE MemRegion CallsHandled Engaged Queue
1 1 1 F1234 Denver English NCAL 1 120 B
2 1 2 F1234 Corona Spanish SCAL 1 30 C
3 1 3 F1234 Corona English NCAL 8 1243 B
4 1 4 F1234 Corona Spanish NCAL 1 75 A
5 1 5 F1234 Corona Spanish SCAL 1 45 A
6 1 6 F1234 Denver English SCAL 2 55 B
7 1 7 F1234 Corona English NCAL 1 200 C
8 1 8 F1234 Corona English NCAL 1 120 B
9 1 9 F1234 Denver English NCAL 1 30 A
10 1 10 F1234 Corona Spanish NCAL 1 230 C
11 1 1 A1234 Corona English NCAL 10 2065 C
12 1 2 A1234 Corona English SCAL 1 45 A
13 1 3 A1234 Corona Spanish NCAL 3 55 A
14 1 4 A1234 Corona English NCAL 1 200 A
15 1 5 A1234 Corona English SCAL 8 1483 B
16 1 6 A1234 Denver English SCAL 1 30 B
17 1 7 A1234 Corona Spanish SCAL 6 1243 C
18 1 8 A1234 Corona Spanish SCAL 1 75 B
19 1 9 A1234 Corona Spanish SCAL 1 45 C
20 1 10 A1234 Corona English SCAL 2 55 B
Variables not shown: TotalCallsHandled (int), TotalEngaged (int)
To make the Queue variables into a single factor variable, you could do this:
# Locate every flag equal to 1 as a (row, col) index pair.
queues <- which(dat[, c("QueueA", "QueueB", "QueueC")] == 1, arr.ind = TRUE)
# Put the column indices back into row order.
queues <- queues[order(queues[, "row"]), "col"]
# Fix levels = 1:3 so each label stays tied to its column index; without it,
# factor() errors whenever one of the queues never occurs in the data.
queues <- factor(
  queues,
  levels = 1:3,
  labels = c("QueueA", "QueueB", "QueueC")
)
# Assumes exactly one flag is set per row; otherwise the lengths differ and
# data.frame() fails.
dat <- data.frame(dat, queues)
Though, @chappers' approach for this is nicer.
Then, you can use aggregate:
# Sum CallsHandled and Engaged for each combination of the six grouping
# columns. The `by` list is left unnamed, so the result labels the grouping
# columns Group.1 to Group.6.
aggregate(
  dat[c("CallsHandled", "Engaged")],
  by = with(dat, list(Month, ID, Location, LANGUAGE, MemRegion, queues)),
  FUN = sum
)
# Group.1 Group.2 Group.3 Group.4 Group.5 Group.6 CallsHandled Engaged
#1 1 A1234 Corona English NCAL QueueA 1 200
#2 1 F1234 Denver English NCAL QueueA 1 30
#3 1 A1234 Corona Spanish NCAL QueueA 3 55
#4 1 F1234 Corona Spanish NCAL QueueA 1 75
#5 1 A1234 Corona English SCAL QueueA 1 45
#6 1 F1234 Corona Spanish SCAL QueueA 1 45
#7 1 F1234 Corona English NCAL QueueB 9 1363
#8 1 F1234 Denver English NCAL QueueB 1 120
#9 1 A1234 Corona English SCAL QueueB 10 1538
#10 1 A1234 Denver English SCAL QueueB 1 30
#11 1 F1234 Denver English SCAL QueueB 2 55
#12 1 A1234 Corona Spanish SCAL QueueB 1 75
#13 1 A1234 Corona English NCAL QueueC 10 2065
#14 1 F1234 Corona English NCAL QueueC 1 200
#15 1 F1234 Corona Spanish NCAL QueueC 1 230
#16 1 A1234 Corona Spanish SCAL QueueC 7 1288
#17 1 F1234 Corona Spanish SCAL QueueC 1 30
@chappers' solution aggregates correctly but leaves me with a bunch of duplicate rows for some reason that I can't figure out. This works for factors and reduces the number of rows in my actual dataframe (no duplicates):
# Sum CallsHandled and Engaged per combination of the grouping variables,
# returning one row per group. The original was missing the parenthesis that
# closes cbind() before `~`, so it did not parse.
aggregate(
  cbind(CallsHandled, Engaged) ~ Month + ID + Location + LANGUAGE + MemRegion,
  data = df,
  FUN = sum,
  na.rm = TRUE
)

Resources