Count zeros before a different value by ID in R

I need to count zeros by ID in R, but I need them just the zeros before a different Value.
Here's an example of my data.
x<-tibble::tribble(
~ID, ~Date, ~Value,
1L, "01-01-2016", 0L,
1L, "01-02-2016", 0L,
1L, "01-03-2016", 0L,
1L, "01-04-2016", 0L,
1L, "01-05-2016", 1000L,
1L, "01-06-2016", 0L,
2L, "01-01-2016", 0L,
2L, "01-02-2016", 0L,
2L, "01-03-2016", 1500L,
2L, "01-04-2016", 0L,
2L, "01-05-2016", 0L,
2L, "01-06-2016", 0L,
3L, "01-01-2016", 0L,
3L, "01-02-2016", -300L,
3L, "01-03-2016", 0L,
3L, "01-04-2016", 0L,
3L, "01-05-2016", 200L,
3L, "01-06-2016", 0L
)
x<-as.data.frame(x)
head(x)
# ID Date Value
# 1 1 01-01-2016 0
# 2 1 01-02-2016 0
# 3 1 01-03-2016 0
# 4 1 01-04-2016 0
# 5 1 01-05-2016 1000
# 6 1 01-06-2016 0
What I'm looking for is something like this:
ID 1 2 3
Count 4 2 1
Because there are 4 zeros before a different value in ID 1, 2 zeros in ID 2 and 1 zero in ID 3. Also, I'd like to save it in the dataframe as follows:
new_x<-tibble::tribble(
~ID, ~Date, ~Value, ~Count,
1L, "01-01-2016", 0L, 4L,
1L, "01-02-2016", 0L, 4L,
1L, "01-03-2016", 0L, 4L,
1L, "01-04-2016", 0L, 4L,
1L, "01-05-2016", 1000L, 4L,
1L, "01-06-2016", 0L, 4L,
2L, "01-01-2016", 0L, 2L,
2L, "01-02-2016", 0L, 2L,
2L, "01-03-2016", 1500L, 2L,
2L, "01-04-2016", 0L, 2L,
2L, "01-05-2016", 0L, 2L,
2L, "01-06-2016", 0L, 2L,
3L, "01-01-2016", 0L, 1L,
3L, "01-02-2016", -300L, 1L,
3L, "01-03-2016", 0L, 1L,
3L, "01-04-2016", 0L, 1L,
3L, "01-05-2016", 200L, 1L,
3L, "01-06-2016", 0L, 1L
)
Does anyone know how to solve it? Thanks!

One way would be:
library(tidyverse)
# Count the leading zeros per ID: cumsum(Value != 0) stays at 0 until the
# first non-zero Value appears, so summing the positions where it is still 0
# counts the zeros that come before the first different value in each group.
x %>%
group_by(ID) %>%
summarise(n = sum(cumsum(Value != 0) == 0, na.rm = TRUE)) %>%
# Reshape to a single row with one column per ID, then label that row "Count".
pivot_wider(names_from = ID, values_from = n) %>%
add_column(ID = 'Count', .before = 1) %>%
as.data.frame
Output:
ID 1 2 3
1 Count 4 2 1
For saving as column:
# Same leading-zero count, kept as a column: the scalar mutate() result is
# recycled so every row of an ID carries that ID's Count.
x %>%
group_by(ID) %>%
mutate(Count = sum(cumsum(Value != 0) == 0, na.rm = TRUE))
There's of course also no need for packages:
# Base-R equivalent: ave() applies the same leading-zero count within each ID
# and recycles it across the group's rows; no packages needed.
transform(x, Count = ave(Value, ID, FUN = function(x) sum(cumsum(x != 0) == 0, na.rm = TRUE)))

Here's another way to do it using purrr's detect_index:
library(purrr)
library(dplyr)
# detect_index() returns the position of the first non-zero Value, so
# subtracting 1 gives the number of zeros before it.  When a group contains
# no non-zero Value at all, detect_index() returns 0, which would produce a
# misleading Count of -1; in that case the whole group is zeros, so the
# correct count is the group size n().
x %>%
group_by(ID) %>%
mutate(Count = if (any(Value != 0)) detect_index(Value, ~ .x != 0) - 1 else n())
#> # A tibble: 18 x 4
#> # Groups: ID [3]
#> ID Date Value Count
#> <int> <chr> <int> <dbl>
#> 1 1 01-01-2016 0 4
#> 2 1 01-02-2016 0 4
#> 3 1 01-03-2016 0 4
#> 4 1 01-04-2016 0 4
#> 5 1 01-05-2016 1000 4
#> 6 1 01-06-2016 0 4
#> 7 2 01-01-2016 0 2
#> 8 2 01-02-2016 0 2
#> 9 2 01-03-2016 1500 2
#> 10 2 01-04-2016 0 2
#> 11 2 01-05-2016 0 2
#> 12 2 01-06-2016 0 2
#> 13 3 01-01-2016 0 1
#> 14 3 01-02-2016 -300 1
#> 15 3 01-03-2016 0 1
#> 16 3 01-04-2016 0 1
#> 17 3 01-05-2016 200 1
#> 18 3 01-06-2016 0 1

Related

How to do multiple arithmetic operations in R by group

I have several datasets. The first one:
lid=structure(list(x1 = 619490L, x2 = 10L, x3 = 0L, x4 = 6089230L,
x5 = 0L, x6 = -10L), class = "data.frame", row.names = c(NA,
-1L))
second dataset
lidar=structure(list(A = c(638238.76, 638238.76, 638239.29, 638235.39,
638233.86, 638233.86, 638235.55, 638231.97, 638231.91, 638228.41,
638238.76, 638238.76, 63239.29, 638235.39, 638233.86, 638233.86,
638235.55, 638231.97, 638231.91, 638228.41), B = c(6078001.09,
6078001.09, 6078001.15, 6078001.15, 6078001.07, 6078001.07, 6078001.02,
6078001.08, 6078001.09, 6078001.01, 6078001.09, 6078001.09, 6078001.15,
6078001.15, 6078001.07, 6078001.07, 6078001.02, 6078001.08, 6078001.09,
6078001.01), C = c(186.64, 186.59, 199.28, 189.37, 186.67, 186.67,
198.04, 200.03, 199.73, 192.14, 186.64, 186.59, 199.28, 189.37,
196.67, 186.67, 198.04, 200.03, 199.73, 100.14), gpstime = c(319805734.664265,
319805734.664265, 319805734.67875, 319805734.678768, 319805734.678777,
319805734.678777, 319805734.687338, 319805734.701928, 319805734.701928,
319805734.701945, 319805734.664265, 319805734.664265, 319805734.67875,
319805734.678768, 319805734.678777, 319805734.678777, 319805734.687338,
319805734.701928, 319805734.701928, 319805734.701945), Intensity = c(13L,
99L, 5L, 2L, 20L, 189L, 2L, 11L, 90L, 1L, 13L, 99L, 5L, 2L, 20L,
189L, 2L, 11L, 90L, 1L), ReturnNumber = c(2L, 1L, 1L, 2L, 1L,
1L, 2L, 1L, 1L, 3L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 3L),
NumberOfReturns = c(2L, 1L, 3L, 2L, 1L, 1L, 3L, 1L, 1L, 4L,
2L, 1L, 3L, 2L, 1L, 1L, 3L, 1L, 1L, 4L), ScanDirectionFlag = c(1L,
1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L), EdgeOfFlightline = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L
), Classification = c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), group = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-20L))
lid is a static dataset; it is always the same (always the same values).
I must perform these arithmetic operations
# Stray leading backtick removed so the first statement parses.
# NOTE(review): lid$x3 is 0 in the supplied `lid` data, so `row` comes out Inf.
lidar$row <- round((lidar$A-lid$x1)/lid$x3, 0)
lidar$col <- (lidar$B-lid$x4)/lid$x6
lidar$cdif <- max(lidar$C)-min(lidar$C)
but for each lidar$group separately.
How can I do it better using dplyr?
Thanks for your help.
library(dplyr)
# Grouped version: row and col are plain elementwise arithmetic against the
# single-row `lid` constants, while cdif is the per-group spread of C
# (max - min), recycled onto every row of the group.
# NOTE(review): the question rounded `row` with round(..., 0); this version
# leaves it unrounded — confirm which is wanted.
lidar %>%
group_by(group) %>%
mutate(
row = (A-lid$x1)/lid$x3,
col = (B-lid$x4)/lid$x6,
cdif = max(C)-min(C)
) %>%
ungroup()
# # A tibble: 20 x 14
# A B C gpstime Intensity ReturnNumber NumberOfReturns ScanDirectionFlag EdgeOfFlightline Classification group row col cdif
# <dbl> <dbl> <dbl> <dbl> <int> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
# 1 638239. 6078001. 187. 319805735. 13 2 2 1 0 1 1 Inf 1123. 13.4
# 2 638239. 6078001. 187. 319805735. 99 1 1 1 0 2 1 Inf 1123. 13.4
# 3 638239. 6078001. 199. 319805735. 5 1 3 0 0 1 1 Inf 1123. 13.4
# 4 638235. 6078001. 189. 319805735. 2 2 2 0 0 1 1 Inf 1123. 13.4
# 5 638234. 6078001. 187. 319805735. 20 1 1 0 0 1 1 Inf 1123. 13.4
# 6 638234. 6078001. 187. 319805735. 189 1 1 0 0 1 1 Inf 1123. 13.4
# 7 638236. 6078001. 198. 319805735. 2 2 3 1 0 1 1 Inf 1123. 13.4
# 8 638232. 6078001. 200. 319805735. 11 1 1 0 0 1 1 Inf 1123. 13.4
# 9 638232. 6078001. 200. 319805735. 90 1 1 0 0 1 1 Inf 1123. 13.4
# 10 638228. 6078001. 192. 319805735. 1 3 4 0 0 1 1 Inf 1123. 13.4
# 11 638239. 6078001. 187. 319805735. 13 2 2 1 0 1 2 Inf 1123. 99.9
# 12 638239. 6078001. 187. 319805735. 99 1 1 1 0 2 2 Inf 1123. 99.9
# 13 63239. 6078001. 199. 319805735. 5 1 3 0 0 1 2 -Inf 1123. 99.9
# 14 638235. 6078001. 189. 319805735. 2 2 2 0 0 1 2 Inf 1123. 99.9
# 15 638234. 6078001. 197. 319805735. 20 1 1 0 0 1 2 Inf 1123. 99.9
# 16 638234. 6078001. 187. 319805735. 189 1 1 0 0 1 2 Inf 1123. 99.9
# 17 638236. 6078001. 198. 319805735. 2 2 3 1 0 1 2 Inf 1123. 99.9
# 18 638232. 6078001. 200. 319805735. 11 1 1 0 0 1 2 Inf 1123. 99.9
# 19 638232. 6078001. 200. 319805735. 90 1 1 0 0 1 2 Inf 1123. 99.9
# 20 638228. 6078001. 100. 319805735. 1 3 4 0 0 1 2 Inf 1123. 99.9
row is always Inf because lid$x3 is 0. The only part of this that must be grouped is cdif, since it's the only thing that does any groupwise aggregation; the rest can be done ungrouped.
# Do the ungrouped work first: row and col only combine each row with the
# single-row `lid` constants, so no grouping is needed for them.
lidar %>%
  mutate(
    row = (A - lid$x1) / lid$x3,
    col = (B - lid$x4) / lid$x6
  ) %>%
  # Only the spread of C is a per-group aggregate.
  group_by(group) %>%
  mutate(cdif = diff(range(C))) %>%
  ungroup()
Why would one do it this way? With larger datasets or with a lot of groups, it will be more efficient (perhaps perceptibly faster) to do the whole vector at once instead of per-group. The actual calculations should return identical results.

Get new columns based on data from other columns

My data:
data <- structure(list(col1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), col2 = c(0L, 1L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-18L))
I want to get 2 new columns based on col1 and col2.
Column 3 (group1) is obtained as follows: keep the 1s where there is a zero in the second column; the 2s are simply carried over.
Column 4 (group2) is obtained as follows: keep the 1s where there is a one in the second column; the 2s are simply carried over.
What I want to get:
data <- structure(list(col1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), col2 = c(0L, 1L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), group1 = c(1L,
NA, NA, 1L, 1L, NA, 1L, NA, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), group2 = c(NA, 1L, 1L, NA, NA, 1L, NA, 1L, NA, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-18L))
A solution that uses tidyr::pivot_wider():
library(dplyr)
# Build group1/group2 by pivoting a marker wide: each row contributes a 1 to
# the column named after its col2 value ("group1" or "group2"); afterwards
# rows with col1 == 2 are overwritten with 2 in both group columns (and col2
# reset to 0) to match the desired output.
data %>%
mutate(id = 1:n(), name = paste0("group", col2 + 1), value = 1) %>%
tidyr::pivot_wider() %>%
mutate(col2 = replace(col2, col1 == 2, 0),
# Passing a function plus extra arguments through `...` to across() is
# deprecated in dplyr >= 1.1; use a lambda instead.
across(starts_with("group"), ~ replace(.x, col1 == 2, 2))) %>%
select(-id)
# A tibble: 18 x 4
col1 col2 group1 group2
<int> <dbl> <dbl> <dbl>
1 1 0 1 NA
2 1 1 NA 1
3 1 1 NA 1
4 1 0 1 NA
5 1 0 1 NA
6 1 1 NA 1
7 1 0 1 NA
8 1 1 NA 1
9 1 0 1 NA
10 2 0 2 2
11 2 0 2 2
12 2 0 2 2
13 2 0 2 2
14 2 0 2 2
15 2 0 2 2
16 2 0 2 2
17 2 0 2 2
18 2 0 2 2
You can use ifelse to get group1 and group2.
# Base-R alternative: nested ifelse() builds each group column directly.
# Rows with col1 == 2 always get 2; otherwise a 1 is kept only where col2
# matches the column's target value (0 for group1, 1 for group2), else NA.
transform(data
, group1 = ifelse(col1==2, 2, ifelse(col2==0, 1, NA))
, group2 = ifelse(col1==2, 2, ifelse(col2==1, 1, NA))
)
# col1 col2 group1 group2
#1 1 0 1 NA
#2 1 1 NA 1
#3 1 1 NA 1
#4 1 0 1 NA
#5 1 0 1 NA
#6 1 1 NA 1
#7 1 0 1 NA
#8 1 1 NA 1
#9 1 0 1 NA
#10 2 0 2 2
#11 2 1 2 2
#12 2 1 2 2
#13 2 0 2 2
#14 2 0 2 2
#15 2 1 2 2
#16 2 0 2 2
#17 2 1 2 2
#18 2 0 2 2

Error with using mlogit R function: "The two indexes don't define unique observations"

My dataset look like this
ID choice_situation Alternative Attr1 Attr2 Attr3 choice
ID_1 1 1 0 0 0 0
ID_1 1 2 1 1 0 1
ID_1 2 1 1 1 0 0
ID_1 2 2 1 1 1 1
ID_1 3 1 2 1 0 1
ID_1 3 2 3 1 0 0
ID_2 1 1 3 0 1 1
ID_2 1 2 0 0 0 0
ID_2 2 1 2 1 1 0
ID_2 2 2 2 1 1 1
ID_2 3 1 0 0 0 1
ID_2 3 2 0 0 1 0
.....
Every time I run the code of mlogit function
DCE_data<- mlogit.data(data=dataset, choice = "choice", shape = "long", alt.var = "Alternative", id.var = "ID") #ok
model<- mlogit(choice ~ Attr1 + Attr2 + Attr3 | 0, DCE_data)#error
I get the error below :
Error in dfidx(x, .idx, pkg = pkg) :
the two indexes don't define unique observations
Is the problem caused by the transformed data, DCE_data?
Thanks in advance!
For me your code works:
library(tidyverse)
df <- tibble::tribble(
~ID, ~choice_situation, ~Alternative, ~Attr1, ~Attr2, ~Attr3, ~choice,
"ID_1", 1L, 1L, 0L, 0L, 0L, 0L,
"ID_1", 1L, 2L, 1L, 1L, 0L, 1L,
"ID_1", 2L, 1L, 1L, 1L, 0L, 0L,
"ID_1", 2L, 2L, 1L, 1L, 1L, 1L,
"ID_1", 3L, 1L, 2L, 1L, 0L, 1L,
"ID_1", 3L, 2L, 3L, 1L, 0L, 0L,
"ID_2", 1L, 1L, 3L, 0L, 1L, 1L,
"ID_2", 1L, 2L, 0L, 0L, 0L, 0L,
"ID_2", 2L, 1L, 2L, 1L, 1L, 0L,
"ID_2", 2L, 2L, 2L, 1L, 1L, 1L,
"ID_2", 3L, 1L, 0L, 0L, 0L, 1L,
"ID_2", 3L, 2L, 0L, 0L, 1L, 0L
)
library(mlogit)
# Reshape to mlogit's long "index" format: Alternative identifies the option
# within each choice situation and ID identifies the decision maker.
DCE_data<- mlogit.data(data=df, choice = "choice", shape = "long", alt.var = "Alternative", id.var = "ID")
# Conditional logit with no intercepts (the `| 0` part). Runs without the
# "two indexes don't define unique observations" error on this sample data —
# NOTE(review): that error usually indicates duplicated (id, alternative)
# index pairs in the real dataset; check it for duplicates.
model<- mlogit(choice ~ Attr1 + Attr2 + Attr3 | 0, DCE_data)
summary(model)
> model
Call:
mlogit(formula = choice ~ Attr1 + Attr2 + Attr3 | 0, data = DCE_data, method = "nr")
Coefficients:
Attr1 Attr2 Attr3
0.34137 14.86152 0.39473

How to determine the longest timeperiod and exclude other rows in Excel or R?

In my dataset I have information on the ZIPCODE of 600K+ IDs. If IDs move to a different address, I want to determine at which zipcode they lived the longest and put a '1' for that specific year in that row (no need to combine rows, as I want to know where they lived in each year). That way an ID only has a '1' for a certain year in one row (if there are multiple rows for that ID). The yellow highlight is what I don't want; in that case there is a '1' in two rows for the same year. In the preferred dataset only one '1' per year per ID is possible.
For example: ID 4 lived in 2013 in 2 places (NY and LA), therefore there are 2 rows. At this point there is a 1 in each row for 2013 and I only want a 1 in the row the ID lived the longest between 1-1-2013 and 31-12-2018. ID 4 lived in 2013 longer in LA than in NY, and so only a 1 should be at the row for NY (so in this case the row of LA will be removed because only '0's remain).
I can also put this file in RStudio.
Thank you!
structure(v1)
ID CITY ZIPCODE DATE_START DATE_END DATE_END.1 X2013 X2014 X2015 X2016 X2017 X2018
1 1 NY 1234EF 1-12-2003 31-12-2018 1 1 1 1 1 1
2 2 NY 1234CD 1-12-2003 14-1-2019 14-1-2019 1 1 1 1 1 1
3 2 NY 1234AB 15-1-2019 31-12-2018 0 0 0 0 0 0
4 3 NY 1234AB 15-1-2019 31-12-2018 0 0 0 0 0 0
5 3 NY 1234CD 1-12-2003 14-1-2019 14-1-2019 1 1 1 1 1 1
6 4 LA 1111AB 4-5-2013 31-12-2018 1 1 1 1 1 1
7 4 NY 2222AB 1-12-2003 3-5-2013 3-5-2013 1 0 0 0 0 0
8 5 MIAMI 5555CD 6-2-2015 20-6-2016 20-6-2016 0 0 1 1 0 0
9 5 VEGAS 3333AB 1-1-2004 31-12-2018 1 1 1 1 1 1
10 5 ORLANDO 4444AB 26-2-2004 5-2-2015 5-2-2015 1 1 1 0 0 0
11 5 MIAMI 5555AB 21-6-2016 31-12-2018 31-12-2018 0 0 0 1 1 1
12 5 MIAMI 5555AB 1-1-2019 31-12-2018 0 0 0 0 0 0
13 6 AUSTIN 6666AB 28-2-2017 3-11-2017 3-11-2017 0 0 0 0 1 0
14 6 AUSTIN 6666AB 4-11-2017 31-12-2018 0 0 0 0 1 1
15 6 AUSTIN 7777AB 20-1-2017 27-2-2017 27-2-2017 0 0 0 0 1 0
16 6 AUSTIN 8888AB 1-12-2003 19-1-2017 19-1-2017 1 1 1 1 1 0
>
structure(list(ID = c(1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L,
5L, 5L, 6L, 6L, 6L, 6L), CITY = structure(c(4L, 4L, 4L, 4L, 4L,
2L, 4L, 3L, 6L, 5L, 3L, 3L, 1L, 1L, 1L, 1L), .Label = c("AUSTIN",
"LA", "MIAMI", "NY", "ORLANDO", "VEGAS"), class = "factor"),
ZIPCODE = structure(c(4L, 3L, 2L, 2L, 3L, 1L, 5L, 9L, 6L,
7L, 8L, 8L, 10L, 10L, 11L, 12L), .Label = c("1111AB", "1234AB",
"1234CD", "1234EF", "2222AB", "3333AB", "4444AB", "5555AB",
"5555CD", "6666AB", "7777AB", "8888AB"), class = "factor"),
DATE_START = structure(c(3L, 3L, 4L, 4L, 3L, 10L, 3L, 11L,
1L, 7L, 6L, 2L, 8L, 9L, 5L, 3L), .Label = c("1-1-2004", "1-1-2019",
"1-12-2003", "15-1-2019", "20-1-2017", "21-6-2016", "26-2-2004",
"28-2-2017", "4-11-2017", "4-5-2013", "6-2-2015"), class = "factor"),
DATE_END = structure(c(1L, 2L, 1L, 1L, 2L, 1L, 7L, 4L, 1L,
9L, 8L, 1L, 6L, 1L, 5L, 3L), .Label = c("", "14-1-2019",
"19-1-2017", "20-6-2016", "27-2-2017", "3-11-2017", "3-5-2013",
"31-12-2018", "5-2-2015"), class = "factor"), DATE_END.1 = structure(c(7L,
1L, 7L, 7L, 1L, 7L, 6L, 3L, 7L, 8L, 7L, 7L, 5L, 7L, 4L, 2L
), .Label = c("14-1-2019", "19-1-2017", "20-6-2016", "27-2-2017",
"3-11-2017", "3-5-2013", "31-12-2018", "5-2-2015"), class = "factor"),
X2013 = c(1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L,
0L, 0L, 0L, 1L), X2014 = c(1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L), X2015 = c(1L, 1L, 0L, 0L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L), X2016 = c(1L,
1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L
), X2017 = c(1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L,
0L, 1L, 1L, 1L, 1L), X2018 = c(1L, 1L, 0L, 0L, 1L, 1L, 0L,
0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-16L))
You can use a little help from the lubridate package to calculate how many days are spent at each location. Then we can group_by ID and use case_when to assign 1 when the time is the max or 0 otherwise.
library(lubridate)
library(dplyr)
# For every row, compute how many days of each calendar year fall inside the
# [DATE_START, DATE_END.1] interval (the overlap of the two date ranges,
# floored at 0), then flag with 1 the row with the longest stay per ID/year.
v1 %>%
dplyr::select(ID,CITY,ZIPCODE,DATE_START,DATE_END.1) %>%
rowwise() %>%
mutate("X2013" = max(0, min(dmy("31-12-2013"),dmy(DATE_END.1)) - max(dmy("1-1-2013"),dmy(DATE_START))),
"X2014" = max(0, min(dmy("31-12-2014"),dmy(DATE_END.1)) - max(dmy("1-1-2014"),dmy(DATE_START))),
"X2015" = max(0, min(dmy("31-12-2015"),dmy(DATE_END.1)) - max(dmy("1-1-2015"),dmy(DATE_START))),
"X2016" = max(0, min(dmy("31-12-2016"),dmy(DATE_END.1)) - max(dmy("1-1-2016"),dmy(DATE_START))),
"X2017" = max(0, min(dmy("31-12-2017"),dmy(DATE_END.1)) - max(dmy("1-1-2017"),dmy(DATE_START))),
"X2018" = max(0, min(dmy("31-12-2018"),dmy(DATE_END.1)) - max(dmy("1-1-2018"),dmy(DATE_START)))) %>%
ungroup %>%
group_by(ID) %>%
# mutate_at()/vars() is superseded in current dplyr; across() is the idiom.
# NOTE(review): if two rows of an ID tie for the maximal stay in a year,
# both get a 1 — confirm that cannot happen in the real data.
mutate(across(starts_with("X"), ~ case_when(.x == max(.x) ~ 1,
TRUE ~ 0)))
# A tibble: 16 x 11
# Groups: ID [6]
ID CITY ZIPCODE DATE_START DATE_END.1 X2013 X2014 X2015 X2016 X2017 X2018
<int> <fct> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 NY 1234EF 1-12-2003 31-12-2018 1 1 1 1 1 1
2 2 NY 1234CD 1-12-2003 14-1-2019 1 1 1 1 1 1
3 2 NY 1234AB 15-1-2019 31-12-2018 0 0 0 0 0 0
4 3 NY 1234AB 15-1-2019 31-12-2018 0 0 0 0 0 0
5 3 NY 1234CD 1-12-2003 14-1-2019 1 1 1 1 1 1
6 4 LA 1111AB 4-5-2013 31-12-2018 1 1 1 1 1 1
7 4 NY 2222AB 1-12-2003 3-5-2013 0 0 0 0 0 0
8 5 MIAMI 5555CD 6-2-2015 20-6-2016 0 0 0 0 0 0
9 5 VEGAS 3333AB 1-1-2004 31-12-2018 1 1 1 1 1 1
10 5 ORLANDO 4444AB 26-2-2004 5-2-2015 1 1 0 0 0 0
11 5 MIAMI 5555AB 21-6-2016 31-12-2018 0 0 0 0 1 1
12 5 MIAMI 5555AB 1-1-2019 31-12-2018 0 0 0 0 0 0
13 6 AUSTIN 6666AB 28-2-2017 3-11-2017 0 0 0 0 1 0
14 6 AUSTIN 6666AB 4-11-2017 31-12-2018 0 0 0 0 0 1
15 6 AUSTIN 7777AB 20-1-2017 27-2-2017 0 0 0 0 0 0
16 6 AUSTIN 8888AB 1-12-2003 19-1-2017 1 1 1 1 0 0
There is certainly a way that one could implement the first mutate call to not require manually writing each year, but would take much more work than just typing it out.

aggregation of data in R with assign of a dummy variable by condition

I have the following dataset
mydata=structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), ad_id = c(111L, 111L, 111L,
111L, 1111L, 1111L, 11111L, 11111L, 11111L, 111L, 111L, 1111L,
1111L, 11111L, 11111L, 11111L, 111111L, 111111L), price = c(1L,
0L, 1L, 0L, 2L, 0L, 3L, 0L, 0L, 1L, 0L, 2L, 0L, 3L, 0L, 0L, 1L,
0L), rev = c(2L, 0L, 0L, 2L, 3L, 3L, 4L, 4L, 4L, 2L, 2L, 3L,
3L, 4L, 4L, 4L, 0L, 0L), data = structure(c(1L, 2L, 2L, 3L, 1L,
3L, 1L, 3L, 4L, 1L, 3L, 1L, 3L, 1L, 3L, 4L, 1L, 3L), .Label = c("01.01.2018",
"01.02.2018", "01.03.2018", "02.03.2018"), class = "factor")), .Names = c("id",
"ad_id", "price", "rev", "data"), class = "data.frame", row.names = c(NA,
-18L))
How can I create a dummy variable according to the following logic:
For each id and ad_id I need to aggregate by data price and rev. Each ad_id has a date column (data).
If, for each id and ad_id, rev is greater than the price over the period of up to 90 days (the data column, d-m-y), then the flag is set to 1; otherwise the flag is 0.
In this reproducible example , I just take 1 id and 4 ad_id.
In aggregated by sum form it is view
id ad_id price rev
1 1 111 2 4
2 1 1111 2 6
3 1 11111 3 12
4 1 111111 1 0
So for id=1 , all ad_id (besides ad_id = 111111) satisfy rev > price, so in initial data
ad_id = 111, 1111, 11111 must have flag = 1 and 111111 must have flag = 0.
Here is the desired output:
id ad_id price rev data flag
1 1 111 1 2 01.01.2018 1
2 1 111 0 0 01.02.2018 1
3 1 111 1 0 01.02.2018 1
4 1 111 0 2 01.03.2018 1
5 1 1111 2 3 01.01.2018 1
6 1 1111 0 3 01.03.2018 1
7 1 11111 3 4 01.01.2018 1
8 1 11111 0 4 01.03.2018 1
9 1 11111 0 4 02.03.2018 1
10 1 111111 1 0 01.01.2018 0
11 1 111111 0 0 01.03.2018 0
How to perform such condition
I am not sure if I understood you correctly, but is this what you are looking for:
library(tidyverse)
# Per (id, ad_id): sum price and rev over all rows, set flag = 1 when total
# rev is at least the total price, then attach that flag back onto every
# original row via a left join.
# NOTE(review): this sums over all dates; the question's 90-day window is not
# enforced here — confirm whether that restriction is needed.
mydata %>% as_tibble() %>%
group_by(id, ad_id) %>%
# summarise_at()/vars() is superseded in current dplyr; across() is the idiom.
summarise(across(c(price, rev), sum)) %>%
mutate(flag = if_else(price > rev, 0, 1)) %>%
select(id, ad_id, flag) %>%
left_join(mydata, ., by = c("id", "ad_id"))

Resources