I need to count zeros by ID in R, but only the zeros that come before a different (non-zero) Value.
Here's an example of my data.
# Example data: six monthly Value observations per ID (mostly zeros plus a
# few non-zero amounts), stored as a plain data.frame.
x <- data.frame(
  ID = rep(c(1L, 2L, 3L), each = 6L),
  Date = rep(c("01-01-2016", "01-02-2016", "01-03-2016",
               "01-04-2016", "01-05-2016", "01-06-2016"), times = 3L),
  Value = c(0L, 0L, 0L, 0L, 1000L, 0L,
            0L, 0L, 1500L, 0L, 0L, 0L,
            0L, -300L, 0L, 0L, 200L, 0L)
)
head(x)
# ID Date Value
# 1 1 01-01-2016 0
# 2 1 01-02-2016 0
# 3 1 01-03-2016 0
# 4 1 01-04-2016 0
# 5 1 01-05-2016 1000
# 6 1 01-06-2016 0
What I'm looking for is something like this:
ID 1 2 3
Count 4 2 1
Because there's 4 zeros before a different value in ID 1, 2 zeros in ID 2 and 1 zero in ID 3. Also, I'd like to save it in the dataframe as it follows:
# The desired result: the original rows plus a Count column that repeats,
# on every row of an ID, the number of zeros that ID had before its first
# non-zero Value (4, 2 and 1 respectively).
new_x <- tibble::tibble(
  ID = rep(c(1L, 2L, 3L), each = 6L),
  Date = rep(c("01-01-2016", "01-02-2016", "01-03-2016",
               "01-04-2016", "01-05-2016", "01-06-2016"), times = 3L),
  Value = c(0L, 0L, 0L, 0L, 1000L, 0L,
            0L, 0L, 1500L, 0L, 0L, 0L,
            0L, -300L, 0L, 0L, 200L, 0L),
  Count = rep(c(4L, 2L, 1L), each = 6L)
)
Does anyone know how to solve it? Thanks!
One way would be:
library(tidyverse)
# Count the leading zeros per ID (positions before the first non-zero
# Value), then reshape into a single "Count" row with one column per ID.
leading_zeros <- x %>%
  group_by(ID) %>%
  summarise(n = sum(cumsum(Value != 0) == 0, na.rm = TRUE))
leading_zeros %>%
  pivot_wider(names_from = ID, values_from = n) %>%
  add_column(ID = 'Count', .before = 1) %>%
  as.data.frame()
Output:
ID 1 2 3
1 Count 4 2 1
For saving as column:
# Same leading-zero count, but stored as a column (recycled within each ID).
# cumsum(Value != 0) is 0 until the first non-zero Value, so !cumsum(...)
# is TRUE exactly on the leading zeros.
x %>%
  group_by(ID) %>%
  mutate(Count = sum(!cumsum(Value != 0), na.rm = TRUE))
There's of course also no need for packages:
# Base R only: ave() applies a leading-zero counter within each ID and
# transform() stores the (recycled) result as a new Count column.
count_leading_zeros <- function(v) sum(cumsum(v != 0) == 0, na.rm = TRUE)
transform(x, Count = ave(Value, ID, FUN = count_leading_zeros))
Here's another way to do it using purrr's detect_index:
library(purrr)
library(dplyr)
# purrr::detect_index() gives the position of the first non-zero Value in
# each ID group; subtracting one yields the number of leading zeros.
x %>%
  group_by(ID) %>%
  mutate(Count = detect_index(Value, function(v) v != 0) - 1)
#> # A tibble: 18 x 4
#> # Groups: ID [3]
#> ID Date Value Count
#> <int> <chr> <int> <dbl>
#> 1 1 01-01-2016 0 4
#> 2 1 01-02-2016 0 4
#> 3 1 01-03-2016 0 4
#> 4 1 01-04-2016 0 4
#> 5 1 01-05-2016 1000 4
#> 6 1 01-06-2016 0 4
#> 7 2 01-01-2016 0 2
#> 8 2 01-02-2016 0 2
#> 9 2 01-03-2016 1500 2
#> 10 2 01-04-2016 0 2
#> 11 2 01-05-2016 0 2
#> 12 2 01-06-2016 0 2
#> 13 3 01-01-2016 0 1
#> 14 3 01-02-2016 -300 1
#> 15 3 01-03-2016 0 1
#> 16 3 01-04-2016 0 1
#> 17 3 01-05-2016 200 1
#> 18 3 01-06-2016 0 1
Related
I have several datasets. The first one is:
# Static one-row reference table (always the same values); x1/x4 are
# origin coordinates and x3/x6 the per-cell steps used below.
lid <- data.frame(x1 = 619490L, x2 = 10L, x3 = 0L,
                  x4 = 6089230L, x5 = 0L, x6 = -10L)
second dataset
# dput() of the lidar point data: 20 points split into two groups of 10
# (the group column), with coordinates (A, B), elevation C, GPS time and
# LAS-style point attributes.
# NOTE(review): row 13 has A = 63239.29 (vs 638239.29 elsewhere) and rows
# 15/20 have C = 196.67/100.14 — presumably intentional outliers; verify.
lidar=structure(list(A = c(638238.76, 638238.76, 638239.29, 638235.39,
638233.86, 638233.86, 638235.55, 638231.97, 638231.91, 638228.41,
638238.76, 638238.76, 63239.29, 638235.39, 638233.86, 638233.86,
638235.55, 638231.97, 638231.91, 638228.41), B = c(6078001.09,
6078001.09, 6078001.15, 6078001.15, 6078001.07, 6078001.07, 6078001.02,
6078001.08, 6078001.09, 6078001.01, 6078001.09, 6078001.09, 6078001.15,
6078001.15, 6078001.07, 6078001.07, 6078001.02, 6078001.08, 6078001.09,
6078001.01), C = c(186.64, 186.59, 199.28, 189.37, 186.67, 186.67,
198.04, 200.03, 199.73, 192.14, 186.64, 186.59, 199.28, 189.37,
196.67, 186.67, 198.04, 200.03, 199.73, 100.14), gpstime = c(319805734.664265,
319805734.664265, 319805734.67875, 319805734.678768, 319805734.678777,
319805734.678777, 319805734.687338, 319805734.701928, 319805734.701928,
319805734.701945, 319805734.664265, 319805734.664265, 319805734.67875,
319805734.678768, 319805734.678777, 319805734.678777, 319805734.687338,
319805734.701928, 319805734.701928, 319805734.701945), Intensity = c(13L,
99L, 5L, 2L, 20L, 189L, 2L, 11L, 90L, 1L, 13L, 99L, 5L, 2L, 20L,
189L, 2L, 11L, 90L, 1L), ReturnNumber = c(2L, 1L, 1L, 2L, 1L,
1L, 2L, 1L, 1L, 3L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 3L),
NumberOfReturns = c(2L, 1L, 3L, 2L, 1L, 1L, 3L, 1L, 1L, 4L,
2L, 1L, 3L, 2L, 1L, 1L, 3L, 1L, 1L, 4L), ScanDirectionFlag = c(1L,
1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L), EdgeOfFlightline = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L
), Classification = c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), group = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-20L))
lid static dataset, it always same(always the same values).
I must perform these arithmetic operations
# The three derived columns the question wants (a stray markdown backtick
# before the first statement broke parsing and has been removed).
# Note: cdif aggregates over C and must be computed per lidar$group.
lidar$row <- round((lidar$A-lid$x1)/lid$x3, 0)
lidar$col <- (lidar$B-lid$x4)/lid$x6
lidar$cdif <- max(lidar$C)-min(lidar$C)
but for each lidar$group separately.
How can I do it better using dplyr?
Thanks for your help.
library(dplyr)
# Mirror the question's three formulas inside one grouped mutate().
# NOTE(review): the question wraps the row formula in round(..., 0); it is
# omitted here — with lid$x3 == 0 the division yields Inf either way.
# Only cdif actually depends on the grouping; row and col are elementwise.
lidar %>%
group_by(group) %>%
mutate(
row = (A-lid$x1)/lid$x3,
col = (B-lid$x4)/lid$x6,
cdif = max(C)-min(C)
) %>%
ungroup()
# # A tibble: 20 x 14
# A B C gpstime Intensity ReturnNumber NumberOfReturns ScanDirectionFlag EdgeOfFlightline Classification group row col cdif
# <dbl> <dbl> <dbl> <dbl> <int> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
# 1 638239. 6078001. 187. 319805735. 13 2 2 1 0 1 1 Inf 1123. 13.4
# 2 638239. 6078001. 187. 319805735. 99 1 1 1 0 2 1 Inf 1123. 13.4
# 3 638239. 6078001. 199. 319805735. 5 1 3 0 0 1 1 Inf 1123. 13.4
# 4 638235. 6078001. 189. 319805735. 2 2 2 0 0 1 1 Inf 1123. 13.4
# 5 638234. 6078001. 187. 319805735. 20 1 1 0 0 1 1 Inf 1123. 13.4
# 6 638234. 6078001. 187. 319805735. 189 1 1 0 0 1 1 Inf 1123. 13.4
# 7 638236. 6078001. 198. 319805735. 2 2 3 1 0 1 1 Inf 1123. 13.4
# 8 638232. 6078001. 200. 319805735. 11 1 1 0 0 1 1 Inf 1123. 13.4
# 9 638232. 6078001. 200. 319805735. 90 1 1 0 0 1 1 Inf 1123. 13.4
# 10 638228. 6078001. 192. 319805735. 1 3 4 0 0 1 1 Inf 1123. 13.4
# 11 638239. 6078001. 187. 319805735. 13 2 2 1 0 1 2 Inf 1123. 99.9
# 12 638239. 6078001. 187. 319805735. 99 1 1 1 0 2 2 Inf 1123. 99.9
# 13 63239. 6078001. 199. 319805735. 5 1 3 0 0 1 2 -Inf 1123. 99.9
# 14 638235. 6078001. 189. 319805735. 2 2 2 0 0 1 2 Inf 1123. 99.9
# 15 638234. 6078001. 197. 319805735. 20 1 1 0 0 1 2 Inf 1123. 99.9
# 16 638234. 6078001. 187. 319805735. 189 1 1 0 0 1 2 Inf 1123. 99.9
# 17 638236. 6078001. 198. 319805735. 2 2 3 1 0 1 2 Inf 1123. 99.9
# 18 638232. 6078001. 200. 319805735. 11 1 1 0 0 1 2 Inf 1123. 99.9
# 19 638232. 6078001. 200. 319805735. 90 1 1 0 0 1 2 Inf 1123. 99.9
# 20 638228. 6078001. 100. 319805735. 1 3 4 0 0 1 2 Inf 1123. 99.9
row is always Inf because lid$x3 is 0. The only part of this that must be grouped is cdif, since it's the only thing that does any groupwise aggregation; the rest can be done ungrouped.
# row and col are plain elementwise arithmetic, so they are computed on the
# whole table at once; only cdif needs the per-group aggregation.
lidar %>%
  mutate(row = (A - lid$x1) / lid$x3,
         col = (B - lid$x4) / lid$x6) %>%
  group_by(group) %>%
  mutate(cdif = max(C) - min(C)) %>%
  ungroup()
Why would one do it this way? With larger datasets or with a lot of groups, it will be more efficient (perhaps perceptibly faster) to do the whole vector at once instead of per-group. The actual calculations should return identical results.
My data:
# 18 observations: col1 marks the block (nine 1s then nine 2s) and col2 is
# a binary indicator that repeats the same nine-value pattern in each block.
data <- data.frame(
  col1 = rep(c(1L, 2L), each = 9L),
  col2 = rep(c(0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L), times = 2L)
)
I want to get 2 new columns based on col1 and col2.
column 3 is obtained: We leave units if there is zero in the second column, 2 are simply transferred.
column 4 will turn out: We leave units if there is one in the second column, 2 are simply transferred.
What I want to get:
# The desired output: for col1 == 1, group1 keeps a 1 where col2 is 0 and
# group2 keeps a 1 where col2 is 1 (NA elsewhere); for col1 == 2 both
# group columns simply carry the 2, and col2 is zeroed out.
data <- data.frame(
  col1 = rep(c(1L, 2L), each = 9L),
  col2 = c(0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, rep(0L, 9L)),
  group1 = c(1L, NA, NA, 1L, 1L, NA, 1L, NA, 1L, rep(2L, 9L)),
  group2 = c(NA, 1L, 1L, NA, NA, 1L, NA, 1L, NA, rep(2L, 9L))
)
A solution that uses tidyr::pivot_wider():
library(dplyr)
# Spread a marker value of 1 into group1/group2 (column chosen by col2),
# then overwrite the col1 == 2 rows: col2 is reset to 0 and both group
# columns are set to 2. The temporary id keeps one output row per input row.
data %>%
  mutate(id = seq_len(n()), name = paste0("group", col2 + 1), value = 1) %>%
  tidyr::pivot_wider() %>%
  mutate(col2 = replace(col2, col1 == 2, 0),
         # passing extra arguments through across() positionally is
         # deprecated in dplyr; use an explicit lambda instead
         across(starts_with("group"), ~ replace(.x, col1 == 2, 2))) %>%
  select(-id)
# A tibble: 18 x 4
col1 col2 group1 group2
<int> <dbl> <dbl> <dbl>
1 1 0 1 NA
2 1 1 NA 1
3 1 1 NA 1
4 1 0 1 NA
5 1 0 1 NA
6 1 1 NA 1
7 1 0 1 NA
8 1 1 NA 1
9 1 0 1 NA
10 2 0 2 2
11 2 0 2 2
12 2 0 2 2
13 2 0 2 2
14 2 0 2 2
15 2 0 2 2
16 2 0 2 2
17 2 0 2 2
18 2 0 2 2
You can use ifelse to get group1 and group2.
# Base-R version: rows with col1 == 2 always carry a 2; otherwise a 1 is
# kept in group1 when col2 equals 0 and in group2 when col2 equals 1
# (NA everywhere else). Both columns share the same rule, so use a helper.
flag_group <- function(id_col, ind_col, keep) {
  ifelse(id_col == 2, 2, ifelse(ind_col == keep, 1, NA))
}
transform(data,
          group1 = flag_group(col1, col2, 0),
          group2 = flag_group(col1, col2, 1))
# col1 col2 group1 group2
#1 1 0 1 NA
#2 1 1 NA 1
#3 1 1 NA 1
#4 1 0 1 NA
#5 1 0 1 NA
#6 1 1 NA 1
#7 1 0 1 NA
#8 1 1 NA 1
#9 1 0 1 NA
#10 2 0 2 2
#11 2 1 2 2
#12 2 1 2 2
#13 2 0 2 2
#14 2 0 2 2
#15 2 1 2 2
#16 2 0 2 2
#17 2 1 2 2
#18 2 0 2 2
My dataset look like this
ID choice_situation Alternative Attr1 Attr2 Attr3 choice
ID_1 1 1 0 0 0 0
ID_1 1 2 1 1 0 1
ID_1 2 1 1 1 0 0
ID_1 2 2 1 1 1 1
ID_1 3 1 2 1 0 1
ID_1 3 2 3 1 0 0
ID_2 1 1 3 0 1 1
ID_2 1 2 0 0 0 0
ID_2 2 1 2 1 1 0
ID_2 2 2 2 1 1 1
ID_2 3 1 0 0 0 1
ID_2 3 2 0 0 1 0
.....
Every time I run the code of mlogit function
# Reshape to mlogit's indexed long format (one row per alternative within
# each choice situation), then fit a conditional logit with no intercept
# ("| 0" drops the alternative-specific constants).
# NOTE(review): the reported dfidx error ("the two indexes don't define
# unique observations") suggests the (ID, Alternative) pairs repeat across
# choice situations, so a choice-situation (chid) index may be needed.
DCE_data<- mlogit.data(data=dataset, choice = "choice", shape = "long", alt.var = "Alternative", id.var = "ID") #ok
model<- mlogit(choice ~ Attr1 + Attr2 + Attr3 | 0, DCE_data)#error
I get the error below :
Error in dfidx(x, .idx, pkg = pkg) :
the two indexes don't define unique observations
The problem is from the transformed data : DCE_data ?
Thanks in advance!
For me your code works:
library(tidyverse)
# Reproducible copy of the question's data, built with tribble().
df <- tibble::tribble(
~ID, ~choice_situation, ~Alternative, ~Attr1, ~Attr2, ~Attr3, ~choice,
"ID_1", 1L, 1L, 0L, 0L, 0L, 0L,
"ID_1", 1L, 2L, 1L, 1L, 0L, 1L,
"ID_1", 2L, 1L, 1L, 1L, 0L, 0L,
"ID_1", 2L, 2L, 1L, 1L, 1L, 1L,
"ID_1", 3L, 1L, 2L, 1L, 0L, 1L,
"ID_1", 3L, 2L, 3L, 1L, 0L, 0L,
"ID_2", 1L, 1L, 3L, 0L, 1L, 1L,
"ID_2", 1L, 2L, 0L, 0L, 0L, 0L,
"ID_2", 2L, 1L, 2L, 1L, 1L, 0L,
"ID_2", 2L, 2L, 2L, 1L, 1L, 1L,
"ID_2", 3L, 1L, 0L, 0L, 0L, 1L,
"ID_2", 3L, 2L, 0L, 0L, 1L, 0L
)
library(mlogit)
# Same two calls as the question; with this data they run without error.
# The "#ok"/"#error" markers are the asker's original annotations.
DCE_data<- mlogit.data(data=df, choice = "choice", shape = "long", alt.var = "Alternative", id.var = "ID") #ok
model<- mlogit(choice ~ Attr1 + Attr2 + Attr3 | 0, DCE_data)#error
summary(model)
> model
Call:
mlogit(formula = choice ~ Attr1 + Attr2 + Attr3 | 0, data = DCE_data, method = "nr")
Coefficients:
Attr1 Attr2 Attr3
0.34137 14.86152 0.39473
In my dataset I have information on the ZIPCODE of 600K+ IDs. If an ID moves to a different address, I want to determine at which zipcode they lived the longest and put a '1' for that specific year in that row (no need to combine rows, as I want to know where they lived in which year). That way an ID only has a '1' for a certain year in one row (if there are multiple rows for that ID). The yellow highlight (in the screenshot, not shown here) is what I don't want; in that case there is a '1' in two rows for the same year. In the preferred dataset only one '1' per year per ID is possible.
For example: ID 4 lived in 2013 in 2 places (NY and LA), therefore there are 2 rows. At this point there is a 1 in each row for 2013 and I only want a 1 in the row the ID lived the longest between 1-1-2013 and 31-12-2018. ID 4 lived in 2013 longer in LA than in NY, and so only a 1 should be at the row for NY (so in this case the row of LA will be removed because only '0's remain).
I can also put this file in RStudio.
Thank you!
structure(v1)
ID CITY ZIPCODE DATE_START DATE_END DATE_END.1 X2013 X2014 X2015 X2016 X2017 X2018
1 1 NY 1234EF 1-12-2003 31-12-2018 1 1 1 1 1 1
2 2 NY 1234CD 1-12-2003 14-1-2019 14-1-2019 1 1 1 1 1 1
3 2 NY 1234AB 15-1-2019 31-12-2018 0 0 0 0 0 0
4 3 NY 1234AB 15-1-2019 31-12-2018 0 0 0 0 0 0
5 3 NY 1234CD 1-12-2003 14-1-2019 14-1-2019 1 1 1 1 1 1
6 4 LA 1111AB 4-5-2013 31-12-2018 1 1 1 1 1 1
7 4 NY 2222AB 1-12-2003 3-5-2013 3-5-2013 1 0 0 0 0 0
8 5 MIAMI 5555CD 6-2-2015 20-6-2016 20-6-2016 0 0 1 1 0 0
9 5 VEGAS 3333AB 1-1-2004 31-12-2018 1 1 1 1 1 1
10 5 ORLANDO 4444AB 26-2-2004 5-2-2015 5-2-2015 1 1 1 0 0 0
11 5 MIAMI 5555AB 21-6-2016 31-12-2018 31-12-2018 0 0 0 1 1 1
12 5 MIAMI 5555AB 1-1-2019 31-12-2018 0 0 0 0 0 0
13 6 AUSTIN 6666AB 28-2-2017 3-11-2017 3-11-2017 0 0 0 0 1 0
14 6 AUSTIN 6666AB 4-11-2017 31-12-2018 0 0 0 0 1 1
15 6 AUSTIN 7777AB 20-1-2017 27-2-2017 27-2-2017 0 0 0 0 1 0
16 6 AUSTIN 8888AB 1-12-2003 19-1-2017 19-1-2017 1 1 1 1 1 0
>
# dput() of v1: one row per (ID, address) spell. CITY/ZIPCODE and the date
# columns are factor-coded; X2013..X2018 are 0/1 indicators of presence in
# that calendar year.
structure(list(ID = c(1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L,
5L, 5L, 6L, 6L, 6L, 6L), CITY = structure(c(4L, 4L, 4L, 4L, 4L,
2L, 4L, 3L, 6L, 5L, 3L, 3L, 1L, 1L, 1L, 1L), .Label = c("AUSTIN",
"LA", "MIAMI", "NY", "ORLANDO", "VEGAS"), class = "factor"),
ZIPCODE = structure(c(4L, 3L, 2L, 2L, 3L, 1L, 5L, 9L, 6L,
7L, 8L, 8L, 10L, 10L, 11L, 12L), .Label = c("1111AB", "1234AB",
"1234CD", "1234EF", "2222AB", "3333AB", "4444AB", "5555AB",
"5555CD", "6666AB", "7777AB", "8888AB"), class = "factor"),
DATE_START = structure(c(3L, 3L, 4L, 4L, 3L, 10L, 3L, 11L,
1L, 7L, 6L, 2L, 8L, 9L, 5L, 3L), .Label = c("1-1-2004", "1-1-2019",
"1-12-2003", "15-1-2019", "20-1-2017", "21-6-2016", "26-2-2004",
"28-2-2017", "4-11-2017", "4-5-2013", "6-2-2015"), class = "factor"),
DATE_END = structure(c(1L, 2L, 1L, 1L, 2L, 1L, 7L, 4L, 1L,
9L, 8L, 1L, 6L, 1L, 5L, 3L), .Label = c("", "14-1-2019",
"19-1-2017", "20-6-2016", "27-2-2017", "3-11-2017", "3-5-2013",
"31-12-2018", "5-2-2015"), class = "factor"), DATE_END.1 = structure(c(7L,
1L, 7L, 7L, 1L, 7L, 6L, 3L, 7L, 8L, 7L, 7L, 5L, 7L, 4L, 2L
), .Label = c("14-1-2019", "19-1-2017", "20-6-2016", "27-2-2017",
"3-11-2017", "3-5-2013", "31-12-2018", "5-2-2015"), class = "factor"),
X2013 = c(1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L,
0L, 0L, 0L, 1L), X2014 = c(1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L), X2015 = c(1L, 1L, 0L, 0L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L), X2016 = c(1L,
1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L
), X2017 = c(1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L,
0L, 1L, 1L, 1L, 1L), X2018 = c(1L, 1L, 0L, 0L, 1L, 1L, 0L,
0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-16L))
You can use a little help from the lubridate package to calculate how many days are spent at each location. Then we can group_by ID and use case_when to assign 1 when the time is the max or 0 otherwise.
library(lubridate)
library(dplyr)
# For each address row, compute the number of days of each calendar year
# that fall inside [DATE_START, DATE_END.1] (clamped to the year, floored
# at 0), then keep a 1 only in the row(s) where that yearly total is the
# maximum within the ID, and a 0 elsewhere.
v1 %>%
  dplyr::select(ID,CITY,ZIPCODE,DATE_START,DATE_END.1) %>%
  rowwise() %>%
  mutate("X2013" = max(0, min(dmy("31-12-2013"),dmy(DATE_END.1)) - max(dmy("1-1-2013"),dmy(DATE_START))),
         "X2014" = max(0, min(dmy("31-12-2014"),dmy(DATE_END.1)) - max(dmy("1-1-2014"),dmy(DATE_START))),
         "X2015" = max(0, min(dmy("31-12-2015"),dmy(DATE_END.1)) - max(dmy("1-1-2015"),dmy(DATE_START))),
         "X2016" = max(0, min(dmy("31-12-2016"),dmy(DATE_END.1)) - max(dmy("1-1-2016"),dmy(DATE_START))),
         "X2017" = max(0, min(dmy("31-12-2017"),dmy(DATE_END.1)) - max(dmy("1-1-2017"),dmy(DATE_START))),
         "X2018" = max(0, min(dmy("31-12-2018"),dmy(DATE_END.1)) - max(dmy("1-1-2018"),dmy(DATE_START)))) %>%
  ungroup() %>%
  group_by(ID) %>%
  # mutate_at()/list(~ ...) is superseded; across() with a lambda is the
  # modern equivalent and produces the same result.
  mutate(across(starts_with("X"), ~ case_when(.x == max(.x) ~ 1,
                                              TRUE ~ 0)))
# A tibble: 16 x 11
# Groups: ID [6]
ID CITY ZIPCODE DATE_START DATE_END.1 X2013 X2014 X2015 X2016 X2017 X2018
<int> <fct> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 NY 1234EF 1-12-2003 31-12-2018 1 1 1 1 1 1
2 2 NY 1234CD 1-12-2003 14-1-2019 1 1 1 1 1 1
3 2 NY 1234AB 15-1-2019 31-12-2018 0 0 0 0 0 0
4 3 NY 1234AB 15-1-2019 31-12-2018 0 0 0 0 0 0
5 3 NY 1234CD 1-12-2003 14-1-2019 1 1 1 1 1 1
6 4 LA 1111AB 4-5-2013 31-12-2018 1 1 1 1 1 1
7 4 NY 2222AB 1-12-2003 3-5-2013 0 0 0 0 0 0
8 5 MIAMI 5555CD 6-2-2015 20-6-2016 0 0 0 0 0 0
9 5 VEGAS 3333AB 1-1-2004 31-12-2018 1 1 1 1 1 1
10 5 ORLANDO 4444AB 26-2-2004 5-2-2015 1 1 0 0 0 0
11 5 MIAMI 5555AB 21-6-2016 31-12-2018 0 0 0 0 1 1
12 5 MIAMI 5555AB 1-1-2019 31-12-2018 0 0 0 0 0 0
13 6 AUSTIN 6666AB 28-2-2017 3-11-2017 0 0 0 0 1 0
14 6 AUSTIN 6666AB 4-11-2017 31-12-2018 0 0 0 0 0 1
15 6 AUSTIN 7777AB 20-1-2017 27-2-2017 0 0 0 0 0 0
16 6 AUSTIN 8888AB 1-12-2003 19-1-2017 1 1 1 1 0 0
There is certainly a way that one could implement the first mutate call to not require manually writing each year, but would take much more work than just typing it out.
I have the following dataset
mydata=structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), ad_id = c(111L, 111L, 111L,
111L, 1111L, 1111L, 11111L, 11111L, 11111L, 111L, 111L, 1111L,
1111L, 11111L, 11111L, 11111L, 111111L, 111111L), price = c(1L,
0L, 1L, 0L, 2L, 0L, 3L, 0L, 0L, 1L, 0L, 2L, 0L, 3L, 0L, 0L, 1L,
0L), rev = c(2L, 0L, 0L, 2L, 3L, 3L, 4L, 4L, 4L, 2L, 2L, 3L,
3L, 4L, 4L, 4L, 0L, 0L), data = structure(c(1L, 2L, 2L, 3L, 1L,
3L, 1L, 3L, 4L, 1L, 3L, 1L, 3L, 1L, 3L, 4L, 1L, 3L), .Label = c("01.01.2018",
"01.02.2018", "01.03.2018", "02.03.2018"), class = "factor")), .Names = c("id",
"ad_id", "price", "rev", "data"), class = "data.frame", row.names = c(NA,
-18L))
How can I create a dummy variable according to the following logic:
For each id and ad_id I need to aggregate by data price and rev. Each ad_id has a date column (data).
If, for each id and ad_id, over the period of up to 90 days (data column, d-m-y format) rev is greater than the price, then the flag is set to 1; otherwise the flag is 0.
In this reproducible example , I just take 1 id and 4 ad_id.
In aggregated by sum form it is view
id ad_id price rev
1 1 111 2 4
2 1 1111 2 6
3 1 11111 3 12
4 1 111111 1 0
So for id=1 , all ad_id (besides ad_id = 111111) satisfy rev > price, so in initial data
ad_id = 111, 1111, 11111 must have flag = 1 and 111111 must have flag = 0.
Here is the desired output:
id ad_id price rev data flag
1 1 111 1 2 01.01.2018 1
2 1 111 0 0 01.02.2018 1
3 1 111 1 0 01.02.2018 1
4 1 111 0 2 01.03.2018 1
5 1 1111 2 3 01.01.2018 1
6 1 1111 0 3 01.03.2018 1
7 1 11111 3 4 01.01.2018 1
8 1 11111 0 4 01.03.2018 1
9 1 11111 0 4 02.03.2018 1
10 1 111111 1 0 01.01.2018 0
11 1 111111 0 0 01.03.2018 0
How to perform such condition
I am not sure if I understood you correctly, but is this what you are looking for:
library(tidyverse)
# Aggregate price and rev within each (id, ad_id); an ad is flagged 1 only
# when its total rev strictly exceeds its total price — per the stated rule
# ("rev greater than price"), a tie gets 0 (the original `price > rev`
# inverted test flagged ties as 1). The flag is then joined back onto
# every original row of that ad.
mydata %>% as_tibble() %>%
  group_by(id, ad_id) %>%
  # summarise_at() is superseded; across() is the modern equivalent
  summarise(across(c(price, rev), sum)) %>%
  mutate(flag = if_else(rev > price, 1, 0)) %>%
  select(id, ad_id, flag) %>%
  left_join(mydata, ., by = c("id", "ad_id"))