Merging two data frames in long format based on date - r

I have two data frames: one (df1) records the daily occurrence of different activities, and another (df2) records properties of the activity that occurred during the day.
From df1 it is possible to identify the repeated occurrence of an activity as well as its duration. The Date variable specifies on which day the record starts.
For example:
For id 12 the occurrence starts at day1 and ends at day7. In this case the occurrence is 7 and the duration is 11.
For id 123 the week starts at day5 and ends at day7; it did not occur consecutively because there is a gap at day6, and the duration is 6. Id 10 (starts at day6, ends at day7) occurred 2 times consecutively with a duration of 6.
In df1 the variable Date defines the day when the record started. For example, the record for id 12 started at day1, and so on.
I would like to identify whether, during the consecutive occurrence, there are records of the activity properties in df2.
For example, id 12 occurred 7 times with a duration of 11; there is a record for Wednesday (day3 in df1), and this record corresponds to the 3rd day of the consecutive occurrence. For id 123 there is no data (i.e. no consecutive occurrence), but for id 10, with a 6-day occurrence and a duration of 18, there is a record on the 6th day.
Df1:
id day1 day2 day3 day4 day5 day6 day7 Date
12 2 1 2 1 1 3 1 Mon
123 0 3 0 3 3 0 3 Fri
10 0 3 3 3 3 3 3 Sat
Df2:
id c1 c2 Date
12 3 3 Wednesday
123 3 2 Fri
10 3 1 Sat
Outcome:
id c1 c2 Occurrence Position
12 3 3 7 3
123 0 0 0 0
10 3 1 2 1
Sample data: df1
structure(list(id = c(12L, 123L, 10L), day1 = c(2L, 0L, 3L),
day2 = c(1L, 3L, 3L), day3 = c(2L, 0L, 3L), day4 = c(1L,
3L, 3L), day5 = c(1L, 3L, 3L), day6 = c(3L, 0L, 3L), day7 = c(1L,
3L, 3L), Date = c("Monday", "Friday", "Saturday")), row.names = c(NA,
-3L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x000002a81a571ef0>)
df2:
structure(list(id = c(12, 123, 10), c1 = c(3, 3, 3), c2 = c(3,
2, 1), Date = structure(c(3L, 1L, 2L), .Label = c("Friday", "Saturday",
"Wednesday"), class = "factor")), row.names = c(NA, -3L), class = "data.frame")

A solution with dplyr (maybe not the shortest one):
# library
library(tidyverse)
# get data
df1 <- structure(list(id = c(12L, 123L, 10L),
day1 = c(2L, 0L, 3L),
day2 = c(1L, 3L, 3L),
day3 = c(2L, 0L, 3L),
day4 = c(1L,3L, 3L),
day5 = c(1L, 3L, 3L),
day6 = c(3L, 0L, 3L),
day7 = c(1L,3L, 3L),
Date = c("Monday", "Friday", "Saturday")),
row.names = c(NA,-3L), class = c("data.table", "data.frame"))
df2 <- structure(list(id = c(12, 123, 10),
c1 = c(3, 3, 3),
c2 = c(3, 2, 1),
Date = structure(c(3L, 1L, 2L), .Label = c("Friday", "Saturday","Wednesday"),
class = "factor")), row.names = c(NA, -3L), class = "data.frame")
# change days to numeric (will help you later)
df1 %>% mutate(
Date_nr_df1=case_when(
Date=="Monday" ~ 1,
Date=="Tuesday" ~2,
Date=="Wednesday" ~3,
Date=="Thursday" ~4,
Date=="Friday" ~5,
Date=="Saturday" ~6,
Date=="Sunday" ~7)) -> df1
df2 %>% mutate(
Date_nr_df2=case_when(
Date=="Monday" ~ 1,
Date=="Tuesday" ~2,
Date=="Wednesday" ~3,
Date=="Thursday" ~4,
Date=="Friday" ~5,
Date=="Saturday" ~6,
Date=="Sunday" ~7)) -> df2
# combine data by the id column
left_join(df1,df2, by=c("id")) -> df
# adjust data
df %>%
group_by(id) %>% # to make changes per row
mutate(days=paste0(day1,day2,day3,day4,day5,day6,day7)) %>% #pastes the values together
mutate(days_correct=substring(days,Date_nr_df1)) %>% # applies the start day
mutate(Occurrence_seq=str_split(days_correct, fixed("0"))[[1]][1]) %>% # extracts all days before 0
mutate(Occurrence=nchar(Occurrence_seq)) %>% ## counts these days
mutate(Occurrence=case_when(Occurrence==1 ~ 0, TRUE ~ as.numeric(Occurrence))) %>% # sets Occurrence to 0 if there is no consecutive occurrence
mutate(Position=Date_nr_df2-Date_nr_df1+1) %>% ## calculates the position you wanted
mutate(c1=case_when(Occurrence==0 ~ 0, TRUE ~ c1),
c2=case_when(Occurrence==0 ~ 0, TRUE ~ c2), # note: c2 here, not c1
Position=case_when(Occurrence==0 ~ 0, TRUE ~ as.numeric(Position))) %>%
ungroup() %>% # ungroups the df
select(id,c1,c2,Occurrence,Position) # selects the wanted variables
#> # A tibble: 3 x 5
#> id c1 c2 Occurrence Position
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 12 3 3 7 3
#> 2 123 0 0 0 0
#> 3 10 3 1 2 1
Created on 2020-04-10 by the reprex package (v0.2.1)
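A side note on the pipeline above (my addition): str_split(days_correct, fixed("0"))[[1]][1] only works because group_by(id) leaves a single row per group. With repeated ids, a vectorised alternative using the same column name would be:
mutate(Occurrence_seq = str_extract(days_correct, "^[^0]*")) # everything before the first 0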

Related

Conditional cumulative sum in R with dplyr. All dates before current date

I am looking for a way to use a cumulative sum in R with the condition of not including the current date.
I have the following data frame (which is a subset and simplified version of the real data frame):
df <- structure(list(date_time = structure(c(1609513200, 1609513200, 1609513200,
1609516800, 1609516800, 1609516800, 1609599600, 1609599600, 1609599600,
1609603200, 1609603200, 1609603200), tzone = "UTC", class = c("POSIXct",
"POSIXt")), event = c(1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L),
person = c("A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C"),
did_attend = c(1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L),
events_attended = c(0, 0, 0, 1, 1, 1, 2, 2, 1, 2, 3, 2),
events_attended_desired = c(0L, 0L, 0L, 0L, 0L, 0L, 2L, 2L, 1L, 2L, 2L, 1L)),
class = c("grouped_df", "tbl_df", "tbl", "data.frame"),
row.names = c(NA, -12L), groups = structure(list(person = c("A", "B", "C"),
.rows = structure(list(c(1L, 4L, 7L, 10L), c(2L, 5L, 8L, 11L),
c(3L, 6L, 9L, 12L)), ptype = integer(0),
class = c("vctrs_list_of", "vctrs_vctr", "list"))),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -3L), .drop = TRUE))
df
## date_time event person did_attend events_attended events_attended_desired
## 2021-01-01 15:00:00 1 A 1 0 0
## 2021-01-01 15:00:00 1 B 1 0 0
## 2021-01-01 15:00:00 1 C 1 0 0
## 2021-01-01 16:00:00 2 A 1 1 0
## 2021-01-01 16:00:00 2 B 1 1 0
## 2021-01-01 16:00:00 2 C 0 1 0
## 2021-01-02 15:00:00 1 A 0 2 2
## 2021-01-02 15:00:00 1 B 1 2 2
## 2021-01-02 15:00:00 1 C 1 1 1
## 2021-01-02 16:00:00 2 A 1 2 2
## 2021-01-02 16:00:00 2 B 0 3 2
## 2021-01-02 16:00:00 2 C 1 2 1
The column "did_attend" is a dummy variable which signifies if a person attended the event. The "events_attended" column has obviously been produced by
events <- events %>%
arrange(date_time) %>%
group_by(person) %>%
mutate(events_attended = lag(cumsum(did_attend), default = 0)) %>%
ungroup()
Now I am looking for a way to not include the events of the current date, so the cumulative sum should only sum over the dates prior to the current date (the desired output is in the events_attended_desired column). There are several events each day and the number of events differs from day to day, so a lagged version does not work. I tried several ifelse() calls inside cumsum(), but they didn't work either, because I don't know how to compare the dates in an ifelse() clause within cumsum().
Here's an approach using dplyr and lubridate::floor_date.
First, I add a "date" column to the data frame, so that I can summarize and join based on the date.
Then I join this table to a summarized version of itself. Since df is already grouped by person, count(date, wt = did_attend) is effectively a shortcut for group_by(person, date) %>% summarize(n = sum(did_attend)), so if I then take the lagged cumulative sum of n within each person, we get the desired result.
df2 <- df %>%
mutate(date = lubridate::floor_date(date_time, "day"))
df2 %>%
left_join(
df2 %>%
count(date, wt = did_attend) %>%
mutate(prior_attended = cumsum(lag(n, default = 0))) %>%
select(-n)
)
Multiply each number by 1 if it corresponds to a prior date and 0 otherwise.
library(dplyr)
df %>%
mutate(events_attended = sapply(as.Date(date_time),
function(x) sum((as.Date(date_time) < x) * did_attend))) %>%
arrange(date_time) %>%
ungroup
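For completeness, a sketch of my own (not from the answers above) that makes the per-person, per-day logic explicit: collapse attendance to one row per person and day, take the lagged cumulative sum of those daily totals, and join back. The helper names date, day_total and prior are mine:
library(dplyr)
daily <- df %>%
  ungroup() %>%
  mutate(date = as.Date(date_time)) %>%
  group_by(person, date) %>%
  summarise(day_total = sum(did_attend), .groups = "drop_last") %>%
  mutate(prior = lag(cumsum(day_total), default = 0L)) %>%
  ungroup()
df %>%
  mutate(date = as.Date(date_time)) %>%
  left_join(select(daily, person, date, prior), by = c("person", "date"))
# prior should then match events_attended_desired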

Cumulative sums by month in R

I want to transform my data from this
Month Expenditures
1 1
1 2
2 3
2 6
3 2
3 5
to this:
Month Cumulative_expenditures
1 3
2 12
3 19
, but can't seem to figure out how to do it.
I tried using the cumsum() function, but it counts each observation - it doesn't distinguish between groups.
Any help would be much appreciated!
A two-step base R solution would be:
#Code
df1 <- aggregate(Expenditures~Month,data=mydf,sum)
#Create cum sum
df1$Expenditures <- cumsum(df1$Expenditures)
Output:
Month Expenditures
1 1 3
2 2 12
3 3 19
Some data used:
#Data
mydf <- structure(list(Month = c(1L, 1L, 2L, 2L, 3L, 3L), Expenditures = c(1L,
2L, 3L, 6L, 2L, 5L)), class = "data.frame", row.names = c(NA,
-6L))
Using dplyr:
library(dplyr)
df %>%
group_by(Month) %>%
summarise(Expenditures = sum(Expenditures), .groups = "drop") %>%
mutate(Expenditures = cumsum(Expenditures))
#> # A tibble: 3 x 2
#> Month Expenditures
#> <int> <int>
#> 1 1 3
#> 2 2 12
#> 3 3 19
Or in base R:
data.frame(Month = unique(df$Month),
Expenditure = cumsum(tapply(df$Expenditures, df$Month, sum)))
#> Month Expenditure
#> 1 1 3
#> 2 2 12
#> 3 3 19
Here is another base R option using subset + ave
subset(
transform(df, Expenditures = cumsum(Expenditures)),
ave(rep(FALSE, nrow(df)), Month, FUN = function(x) seq_along(x) == length(x))
)
which gives
Month Expenditures
2 1 3
4 2 12
6 3 19
We can use base R
out <- with(df1, rowsum(Expenditures, Month))
data.frame(Month = row.names(out), Expenditure = cumsum(out))
# Month Expenditure
#1 1 3
#2 2 12
#3 3 19
Or more compactly
with(df1, stack(cumsum(rowsum(Expenditures, Month)[,1])))[2:1]
data
df1 <- structure(list(Month = c(1L, 1L, 2L, 2L, 3L, 3L), Expenditures = c(1L,
2L, 3L, 6L, 2L, 5L)), class = "data.frame", row.names = c(NA,
-6L))
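For comparison, a data.table sketch of my own (not part of the original answers), assuming the same df1 as above:
library(data.table)
setDT(df1)[, .(Expenditures = sum(Expenditures)), by = Month][
  , Expenditures := cumsum(Expenditures)][]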

Matching the previous row in a specific column and performing a calculation in R

I currently have a data file that resembles this:
R ID A B
1 A1 0 0
2 A1 2 4
3 A1 4 8
4 A2 0 0
5 A2 3 3
6 A2 6 6
I would like to write a script that calculates, for example, "(8-4)/(4-2)" from the previous row, but only if the "ID" matches. For example, for the output column "C" in row 3: if A1 == A1 in the "ID" column, then (8-4)/(4-2) = 2. If the IDs do not match, the output is 0.
I would like the output to be like this:
R ID A B C
1 A1 0 0 0
2 A1 2 4 2
3 A1 4 8 2
4 A2 0 0 0
5 A2 3 3 1
6 A2 6 6 1
Hopefully I explained this correctly in a non-confusing manner.
We could group_by ID, use diff to calculate difference between rows and divide.
library(dplyr)
df %>% group_by(ID) %>% mutate(C = c(0, diff(B)/diff(A)))
# R ID A B C
# <int> <fct> <int> <int> <dbl>
#1 1 A1 0 0 0
#2 2 A1 2 4 2
#3 3 A1 4 8 2
#4 4 A2 0 0 0
#5 5 A2 3 3 1
#6 6 A2 6 6 1
and similarly using data.table
library(data.table)
setDT(df)[, C := c(0, diff(B)/diff(A)), ID]
data
df <- structure(list(R = 1:6, ID = structure(c(1L, 1L, 1L, 2L, 2L,
2L), .Label = c("A1", "A2"), class = "factor"), A = c(0L, 2L,
4L, 0L, 3L, 6L), B = c(0L, 4L, 8L, 0L, 3L, 6L)), class = "data.frame",
row.names = c(NA, -6L))
We can also use lag
library(dplyr)
df %>%
group_by(ID) %>%
mutate(C = (B - lag(B, default = first(B)))/(A - lag(A, default = first(A))))
data
df <- structure(list(R = 1:6, ID = structure(c(1L, 1L, 1L, 2L, 2L,
2L), .Label = c("A1", "A2"), class = "factor"), A = c(0L, 2L,
4L, 0L, 3L, 6L), B = c(0L, 4L, 8L, 0L, 3L, 6L)), class = "data.frame",
row.names = c(NA, -6L))
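A small caveat about the lag() version (my note, not part of the original answer): with default = first(B) and default = first(A), the first row of each group evaluates 0/0, which is NaN rather than the 0 shown in the desired output. One way to get 0 exactly is to replace the NaN afterwards, for example:
df %>%
  group_by(ID) %>%
  mutate(C = (B - lag(B, default = first(B)))/(A - lag(A, default = first(A))),
         C = coalesce(C, 0))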

Get the limited rows on date range from dataframe in R

I have this data frame:
token DD1 Type DD2 Price
AB-1 2018-01-01 10:12:15 Low 2018-01-25 10000
AB-5 2018-01-10 10:12:15 Low 2018-01-25 15000
AB-2 2018-01-05 12:25:04 High 2018-01-20 25000
AB-3 2018-01-03 17:04:25 Low 2018-01-27 50000
....
AB-8 2017-12-10 21:08:12 Low 2017-12-30 60000
AB-8 2017-12-10 21:08:12 High 2017-12-30 30000
dput:
structure(list(token = structure(c(2L, 5L, 3L, 4L, 1L, 6L, 6L
), .Label = c("....", "AB-1", "AB-2", "AB-3", "AB-5", "AB-8"), class = "factor"),
DD1 = structure(c(2L, 5L, 4L, 3L, 1L, 6L, 6L), .Label = c("",
"01/01/2018 10:12:15", "03/01/2018 17:04:25", "05/01/2018 12:25:04",
"10/01/2018 10:12:15", "10/12/2017 21:08:12"), class = "factor"),
Type = structure(c(3L, 3L, 2L, 3L, 1L, 3L, 2L), .Label = c("",
"High", "Low"), class = "factor"), DD2 = structure(c(3L,
3L, 2L, 4L, 1L, 5L, 5L), .Label = c("", "20/01/2018", "25/01/2018",
"27/01/2018", "30/12/2017"), class = "factor"), Price = c(10000L,
15000L, 25000L, 50000L, NA, 60000L, 30000L)), .Names = c("token",
"DD1", "Type", "DD2", "Price"), class = "data.frame", row.names = c(NA,
-7L))
From the above data frame I want two kinds of subset data frames: one based on date (the last three dates from DD2, in descending order; if no row is available for a particular date, show that date with all fields as '0') and one based on month (the last three months, in descending order; if no row is available for a particular month, show that month with all fields as '0').
Formula for Avg Low (same for Avg High): DD2 - DD1, then take the median over the rows available.
% formula for month: (Recent Value - Old Value) / (Old Value)
The code should pick the last three days' data as well as the last three months' data from the data frame whenever I run it.
DF1:
Date        n_Low  Med_Low  Sum_Low  n_High  Med_High  Sum_High
27-01-2018 1 24 50000 0 0 0
26-01-2018 0 0 0 0 0 0
25-01-2018 2 19.5 25000 0 0 0
DF2
Month   n_Low  n_Low_%  Sum_Low  Sum_Low_%  n_High  n_High_%  Sum_High  Sum_High_%
Jan-18 3 200% 75000 25% 1 0% 25000 -17%
Dec-17 1 100% 60000 100% 1 100% 0 100%
Nov-17 0 - - - 0 - - -
Although this Q already has an accepted answer, I felt challenged to provide an answer which uses dcast() and melt(). Any missing dates and months are completed using CJ() and joins as requested by the OP.
The code tries to reproduce the OP's expected results as closely as possible. This particular customisation is why the code looks so convoluted.
If requested, I am willing to explain the code in more detail.
library(data.table)
setDT(DF)
# daily
DF1 <-
DF[, .(n = .N, days = median(difftime(as.Date(DD2, "%d/%m/%Y"),
as.Date(DD1, "%d/%m/%Y"), units = "day")),
sum = sum(Price)), by = .(DD2, Type)][
, Date := as.Date(DD2, "%d/%m/%Y")][
, dcast(.SD, Date ~ Type, value.var = c("n", "days", "sum"), fill = 0)][
.(Date = seq(max(Date), length.out = 3L, by = "-1 days")), on = "Date"][
, setcolorder(.SD, c(1, 3, 5, 7, 2, 4, 6))][
is.na(n_Low), (2:7) := lapply(.SD, function(x) 0), .SDcols = 2:7][]
DF1
Date n_Low days_Low sum_Low n_High days_High sum_High
1: 2018-01-27 1 24.0 days 50000 0 0 days 0
2: 2018-01-26 0 0.0 days 0 0 0 days 0
3: 2018-01-25 2 19.5 days 25000 0 0 days 0
# monthly
DF2 <-
DF[, Month := lubridate::floor_date(as.Date(DD2, "%d/%m/%Y"), unit = "month")][
, .(n = .N, sum = sum(Price)), by = .(Month, Type)][
CJ(Month = seq(max(Month), length.out = 3L, by = "-1 months"), Type = unique(Type)),
on = .(Month, Type)][
, melt(.SD, id.vars = c("Month", "Type"))][
is.na(value), value := 0][
, Pct := {
old <- shift(value); round(100 * ifelse(old == 0, 1, (value - old) / old))
},
by = .(variable, Type)][
, dcast(.SD, Type + Month ~ variable, value.var = c("value", "Pct"))][
, setnames(.SD, c("value_n", "value_sum"), c("n", "sum"))][
, dcast(.SD, Month ~ Type, value.var = c("n", "Pct_n", "sum", "Pct_sum"))][
order(-Month), setcolorder(.SD, c(1, 3, 5, 7, 9, 2, 4, 6, 8))]
DF2
Month n_Low Pct_n_Low sum_Low Pct_sum_Low n_High Pct_n_High sum_High Pct_sum_High
1: 2018-01-01 3 200 75000 25 1 0 25000 -17
2: 2017-12-01 1 100 60000 100 1 100 30000 100
3: 2017-11-01 0 NA 0 NA 0 NA 0 NA
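For readers less familiar with the reshaping step, a tiny standalone illustration (my own toy data, not the question's) of dcast() with several value.var columns, which is the core trick used above:
library(data.table)
toy <- data.table(Date = as.Date(c("2018-01-25", "2018-01-25", "2018-01-27")),
                  Type = c("Low", "High", "Low"),
                  n = c(2L, 1L, 1L),
                  sum = c(25000, 5000, 50000))
dcast(toy, Date ~ Type, value.var = c("n", "sum"), fill = 0)
# one row per Date with columns n_High, n_Low, sum_High, sum_Low; missing combinations are filled with 0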
Does the following approach help?
require(tidyverse)
require(lubridate) # for dmy_hms(), dmy(), %--%, ddays() and floor_date()
Edit
This is a very convoluted approach and can most certainly be solved more elegantly.
dat <- structure(list(token = structure(c(2L, 5L, 3L, 4L, 1L, 6L, 6L), .Label = c("....", "AB-1", "AB-2", "AB-3", "AB-5", "AB-8"), class = "character"), DD1 = structure(c(2L, 5L, 4L, 3L, 1L, 6L, 6L), .Label = c("", "01/01/2018 10:12:15", "03/01/2018 17:04:25", "05/01/2018 12:25:04", "10/01/2018 10:12:15", "10/12/2017 21:08:12"), class = "factor"),
Type = structure(c(3L, 3L, 2L, 3L, 1L, 3L, 2L), .Label = c("", "High", "Low"), class = "character"), DD2 = structure(c(3L, 3L, 2L, 4L, 1L, 5L, 5L), .Label = c("", "20/01/2018", "25/01/2018", "27/01/2018", "30/12/2017"), class = "factor"), Price = c(10000L, 15000L, 25000L, 50000L, NA, 60000L, 30000L)), .Names = c("token", "DD1", "Type", "DD2", "Price"), class = "data.frame", row.names = c(NA, -7L))
# I have included this in the code because structure(your output) had messed up the factors a lot
dat <- dat[c(1:4,6:7),]
dat <- dat %>% mutate(DD1 = dmy_hms(DD1), DD2 = dmy(DD2), Type = as.character(Type))
dat_summary <- dat %>%
mutate(diff_days = round(as.duration(DD1%--%DD2)/ddays(1),0)) %>%
#uses lubridate to calculate the number of days between each DD2 and DD1
group_by(DD2,Type) %>% #because your operations are performed by each Type by DD2
summarise(med = median(diff_days), # calculates the median
sum = sum(Price), # the sum
n1 = n()) # and the number of rows per DD2/Type (used below for DF2)
# A tibble: 5 x 5
# Groups: DD2 [?]
DD2 Type med sum n1
<date> <chr> <dbl> <int> <int>
1 2017-12-30 2 19.0 30000 1
2 2017-12-30 3 19.0 60000 1
3 2018-01-20 2 14.0 25000 1
4 2018-01-25 3 19.5 25000 2
5 2018-01-27 3 23.0 50000 1
Now find the most recent day with a value in Price (per Type and month)
datematch <- dat %>% group_by(Type,month = floor_date(DD2, "month")) %>%
arrange(Type, desc(DD2)) %>%
summarise(maxDate = max(DD2)) %>%
select(Type, maxDate)
Now create helper data frames for merging. dummy_date will contain the last day with a value and the previous two days, for both types (low and high); all_dates will contain... well, all dates.
list1 <- split(datematch$maxDate, datematch$Type)
list_type2 <- do.call('c',lapply(list1[['2']], function(x) seq(as.Date(x)-2, as.Date(x), by="days")))
list_type3 <- do.call('c',lapply(list1[['3']], function(x) seq(as.Date(x)-2, as.Date(x), by="days")))
dd_2 <- data.frame (DD2 = list_type2, Type = as.character(rep('2', length(list_type2))), stringsAsFactors = F)
dd_3 <- data.frame (DD2 = list_type3, Type = as.character(rep('3', length(list_type3))), stringsAsFactors = F)
dummy_date = rbind(dd_2, dd_3)
seq_date <- seq(as.Date('2017-12-01'),as.Date('2018-01-31'), by = 'days')
all_dates <- data.frame (DD2 = rep(seq_date,2), Type = as.character(rep(c('2','3'),each = length(seq_date))),stringsAsFactors = F)
Now we can join your data frame with all days, so that every single day in the period gets a row
all_dates <- left_join(all_dates, dat_summary, by = c('DD2', 'Type'))
and we can filter this result with dummy_date, which (as we remember) contains only the last day with data and the two days before it
df1<- left_join(dummy_date, all_dates, by = c('DD2', 'Type')) %>% arrange(Type, desc(DD2))
df1
DD2 Type med sum n1
1 2018-01-20 2 14.0 25000 1
2 2018-01-19 2 NA NA NA
3 2018-01-18 2 NA NA NA
4 2017-12-30 2 19.0 30000 1
5 2017-12-29 2 NA NA NA
6 2017-12-28 2 NA NA NA
7 2018-01-27 3 23.0 50000 1
8 2018-01-26 3 NA NA NA
9 2018-01-25 3 19.5 25000 2
10 2017-12-30 3 19.0 60000 1
11 2017-12-29 3 NA NA NA
12 2017-12-28 3 NA NA NA
Sorry that 'Type' is not correctly shown as Low and High; I had problems reading your data. I hope that this helps somewhat.
Edit
Added a suggestion for a way to get to DF2:
df1 %>% group_by(Type, month = floor_date(DD2, 'month')) %>%
summarise(sum = sum(sum, na.rm = T),
n = sum(n1, na.rm = T)) %>%
unite(sum.n, c('sum','n')) %>%
spread(Type, sum.n) %>%
rename(low = '3', high = '2') %>%
separate(high, c('high','n_high')) %>%
separate(low, c('low','n_low')) %>%
mutate(dummy_low = as.integer(c(NA, low[1:length(low)-1])),
dummy_high = as.integer(c(NA, high[1:length(high)-1])),
low = as.integer(low),
high = as.integer(high))%>%
mutate(perc_low = 100*(low-dummy_low)/dummy_low)
# A tibble: 2 x 8
month high n_high low n_low dummy_low dummy_high perc_low
<date> <int> <chr> <int> <chr> <int> <int> <dbl>
1 2017-12-01 30000 1 60000 1 NA NA NA
2 2018-01-01 25000 1 75000 3 60000 30000 25.0
It's up to you to add the remaining columns for 'high' and the count. I am sure the solution is not the most elegant one, but it should work. DF2 now has only two months, but that is because you provided only two months in your example. It should work with any number of months, and you can then filter the last three months.
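Following the author's hint about the 'high' columns, the analogous percentage column (my addition) would simply mirror the last mutate() of the pipeline above:
mutate(perc_high = 100*(high - dummy_high)/dummy_high)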

Calculate column sums for each combination of two grouping variables [duplicate]

This question already has answers here:
How to sum a variable by group
(18 answers)
Closed 7 years ago.
I have a dataset that looks something like this:
Type Age count1 count2 Year Pop1 Pop2 TypeDescrip
A 35 1 1 1990 30000 50000 alpha
A 35 3 1 1990 30000 50000 alpha
A 45 2 3 1990 20000 70000 alpha
B 45 2 1 1990 20000 70000 beta
B 45 4 5 1990 20000 70000 beta
I want to add the counts of the rows that are matching in the Type and Age columns. So ideally I would end up with a dataset that looks like this:
Type Age count1 count2 Year Pop1 Pop2 TypeDescrip
A 35 4 2 1990 30000 50000 alpha
A 45 2 3 1990 20000 70000 alpha
B 45 6 6 1990 20000 70000 beta
I've tried using nested duplicated() statements such as below:
typedup = duplicated(df$Type)
bothdup = duplicated(df[(typedup == TRUE),]$Age)
but this flags rows for which Age or Type is duplicated, not necessarily rows where both are duplicated together.
I've also tried tapply:
tapply(c(df$count1, df$count2), c(df$Age, df$Type), sum)
but this output is difficult to work with. I want to have a data.frame when I'm done.
I don't want to use a for-loop because my dataset is quite large.
Try
library(dplyr)
df1 %>%
group_by(Type, Age) %>%
summarise_each(funs(sum))
# Type Age count1 count2
#1 A 35 4 2
#2 A 45 2 3
#3 B 45 6 6
In the newer versions of dplyr
df1 %>%
group_by(Type, Age) %>%
summarise_all(sum)
Or using base R
aggregate(.~Type+Age, df1, FUN=sum)
# Type Age count1 count2
#1 A 35 4 2
#2 A 45 2 3
#3 B 45 6 6
Or
library(data.table)
setDT(df1)[, lapply(.SD, sum), .(Type, Age)]
# Type Age count1 count2
#1: A 35 4 2
#2: A 45 2 3
#3: B 45 6 6
Update
Based on the new dataset,
df2 %>%
group_by(Type, Age,Pop1, Pop2, TypeDescrip) %>%
summarise_each(funs(sum), matches('^count'))
# Type Age Pop1 Pop2 TypeDescrip count1 count2
#1 A 35 30000 50000 alpha 4 2
#2 A 45 20000 70000 beta 2 3
#3 B 45 20000 70000 beta 6 6
data
df1 <- structure(list(Type = c("A", "A", "A", "B", "B"), Age = c(35L,
35L, 45L, 45L, 45L), count1 = c(1L, 3L, 2L, 2L, 4L), count2 = c(1L,
1L, 3L, 1L, 5L)), .Names = c("Type", "Age", "count1", "count2"
), class = "data.frame", row.names = c(NA, -5L))
df2 <- structure(list(Type = c("A", "A", "A", "B", "B"), Age = c(35L,
35L, 45L, 45L, 45L), count1 = c(1L, 3L, 2L, 2L, 4L), count2 = c(1L,
1L, 3L, 1L, 5L), Year = c(1990L, 1990L, 1990L, 1990L, 1990L),
Pop1 = c(30000L, 30000L, 20000L, 20000L, 20000L), Pop2 = c(50000L,
50000L, 70000L, 70000L, 70000L), TypeDescrip = c("alpha",
"alpha", "beta", "beta", "beta")), .Names = c("Type", "Age",
"count1", "count2", "Year", "Pop1", "Pop2", "TypeDescrip"),
class = "data.frame", row.names = c(NA, -5L))
@hannah You can also use SQL via the sqldf package:
library(sqldf)
sqldf("select
Type,Age,
sum(count1) as sum_count1,
sum(count2) as sum_count2
from
df
group by
Type,Age
")
