I have a dataset with the variables ID, Month (or period), and the income for that month. What I need is to put a 1 if the client buys in any of the next 3 months, or a 0 if not, and to do this for every ID.
For example, if I am in month 1 and there is a purchase in the next 3 months, then put a 1 in that row for that client.
For the last periods, where there are not 3 months left, an NA should appear.
df<-tibble::tribble(
~ID, ~Month, ~Incomes,
1L, 1L, 5000L,
1L, 2L, 0L,
1L, 3L, 0L,
1L, 4L, 0L,
1L, 5L, 0L,
1L, 6L, 0L,
1L, 7L, 400L,
1L, 8L, 300L,
1L, 9L, 0L,
1L, 10L, 0L,
1L, 11L, 0L,
1L, 12L, 0L,
1L, 13L, 400L,
2L, 1L, 0L,
2L, 2L, 100L,
2L, 3L, 0L,
2L, 4L, 0L,
2L, 5L, 0L,
2L, 6L, 0L,
2L, 7L, 0L,
2L, 8L, 1500L,
2L, 9L, 0L,
2L, 10L, 0L,
2L, 11L, 0L,
2L, 12L, 100L,
2L, 13L, 750L,
3L, 1L, 0L,
3L, 2L, 0L,
3L, 3L, 0L,
3L, 4L, 0L,
3L, 5L, 700L,
3L, 6L, 240L,
3L, 7L, 100L,
3L, 8L, 0L,
3L, 9L, 0L,
3L, 10L, 0L,
3L, 11L, 0L,
3L, 12L, 500L,
3L, 13L, 760L
)
df<-as.data.frame(df)
# ID Month Incomes
# 1 1 5000
# 1 2 0
# 1 3 0
# 1 4 0
# 1 5 0
# 1 6 0
# 1 7 400
# 1 8 300
# 1 9 0
# 1 10 0
# 1 11 0
# 1 12 0
# 1 13 400
# 2 1 0
# 2 2 100
# 2 3 0
# 2 4 0
# 2 5 0
# 2 6 0
# 2 7 0
# 2 8 1500
# 2 9 0
# 2 10 0
# 2 11 0
# 2 12 100
# 2 13 750
# 3 1 0
# 3 2 0
# 3 3 0
# 3 4 0
# 3 5 700
# 3 6 240
# 3 7 100
# 3 8 0
# 3 9 0
# 3 10 0
# 3 11 0
# 3 12 500
# 3 13 760
What I hope for should look like this:
dffinal<- tibble::tribble(
~ID_RUT, ~Month, ~Incomes, ~Quarter,
1L, 1L, 5000L, 0L,
1L, 2L, 0L, 0L,
1L, 3L, 0L, 0L,
1L, 4L, 0L, 1L,
1L, 5L, 0L, 1L,
1L, 6L, 0L, 1L,
1L, 7L, 400L, 1L,
1L, 8L, 300L, 0L,
1L, 9L, 0L, 0L,
1L, 10L, 0L, 0L,
1L, 11L, 0L, NA,
1L, 12L, 0L, NA,
1L, 13L, 400L, NA,
2L, 1L, 0L, 1L,
2L, 2L, 100L, 0L,
2L, 3L, 0L, 0L,
2L, 4L, 0L, 0L,
2L, 5L, 0L, 1L,
2L, 6L, 0L, 1L,
2L, 7L, 0L, 1L,
2L, 8L, 1500L, 0L,
2L, 9L, 0L, 1L,
2L, 10L, 0L, 1L,
2L, 11L, 0L, NA,
2L, 12L, 100L, NA,
2L, 13L, 750L, NA,
3L, 1L, 0L, 0L,
3L, 2L, 0L, 1L,
3L, 3L, 0L, 1L,
3L, 4L, 0L, 1L,
3L, 5L, 700L, 1L,
3L, 6L, 240L, 1L,
3L, 7L, 100L, 0L,
3L, 8L, 0L, 0L,
3L, 9L, 0L, 1L,
3L, 10L, 0L, 1L,
3L, 11L, 0L, NA,
3L, 12L, 500L, NA,
3L, 13L, 760L, NA
)
# ID Month Incomes Quarterly
# 1 1 5000 0
# 1 2 0 0
# 1 3 0 0
# 1 4 0 1
# 1 5 0 1
# 1 6 0 1
# 1 7 400 1
# 1 8 300 0
# 1 9 0 0
# 1 10 0 0
# 1 11 0 NA
# 1 12 0 NA
# 1 13 400 NA
# 2 1 0 1
# 2 2 100 0
# 2 3 0 0
# 2 4 0 0
# 2 5 0 1
# 2 6 0 1
# 2 7 0 1
# 2 8 1500 0
# 2 9 0 1
# 2 10 0 1
# 2 11 0 NA
# 2 12 100 NA
# 2 13 750 NA
# 3 1 0 0
# 3 2 0 1
# 3 3 0 1
# 3 4 0 1
# 3 5 700 1
# 3 6 240 1
# 3 7 100 0
# 3 8 0 0
# 3 9 0 1
# 3 10 0 1
# 3 11 0 NA
# 3 12 500 NA
# 3 13 760 NA
Does anyone know how to do it? Thanks for your time.
1) rollapply: Roll forward along Incomes > 0, returning TRUE if any are TRUE and FALSE otherwise. Convert that to numeric using +. The width list(1:3) means use offsets 1, 2, 3 from the current point, i.e. the next three incomes. Add the partial = TRUE argument to rollapply if you want to consider the remaining one or two incomes near the end of each group, where there are not three left.
library(dplyr)
library(zoo)
df %>%
group_by(ID) %>%
mutate(Quarter = +rollapply(Incomes > 0, list(1:3), any, fill = NA)) %>%
ungroup
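For reference, this is the same pipe with the partial = TRUE variant mentioned above (a sketch, not part of the original code): near the end of each group the window shrinks to the remaining one or two incomes instead of returning NA.
# Sketch of the partial = TRUE variant described in the text above.
df %>%
  group_by(ID) %>%
  mutate(Quarter = +rollapply(Incomes > 0, list(1:3), any,
                              partial = TRUE, fill = NA)) %>%
  ungroup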
2) SQL: An SQL solution would be:
library(sqldf)
over <- "partition by ID rows between 1 following and 3 following"
fn$sqldf("select
*,
(max(Incomes > 0) over ($over)) +
(case when (count(*) over ($over)) = 3 then 0 else Null end) as Quarter
from df")
This can be simplified if it is OK to process elements for which there are fewer than 3 rows following. over is from above:
fn$sqldf("select *, (max(Incomes > 0) over ($over)) as Quarter from df")
A dplyr solution: sum the next three months using lead and take the sign of the result.
df %>%
group_by(ID) %>%
mutate(quarter = sign(lead(Incomes, 3) + lead(Incomes, 2) + lead(Incomes))) %>%
as.data.frame()
#> ID Month Incomes quarter
#> 1 1 1 5000 0
#> 2 1 2 0 0
#> 3 1 3 0 0
#> 4 1 4 0 1
#> 5 1 5 0 1
#> 6 1 6 0 1
#> 7 1 7 400 1
#> 8 1 8 300 0
#> 9 1 9 0 0
#> 10 1 10 0 1
#> 11 1 11 0 NA
#> 12 1 12 0 NA
#> 13 1 13 400 NA
#> 14 2 1 0 1
#> 15 2 2 100 0
#> 16 2 3 0 0
#> 17 2 4 0 0
#> 18 2 5 0 1
#> 19 2 6 0 1
#> 20 2 7 0 1
#> 21 2 8 1500 0
#> 22 2 9 0 1
#> 23 2 10 0 1
#> 24 2 11 0 NA
#> 25 2 12 100 NA
#> 26 2 13 750 NA
#> 27 3 1 0 0
#> 28 3 2 0 1
#> 29 3 3 0 1
#> 30 3 4 0 1
#> 31 3 5 700 1
#> 32 3 6 240 1
#> 33 3 7 100 0
#> 34 3 8 0 0
#> 35 3 9 0 1
#> 36 3 10 0 1
#> 37 3 11 0 NA
#> 38 3 12 500 NA
#> 39 3 13 760 NA
Another option:
library(dplyr)
df %>%
group_by(ID) %>%
mutate(
Quarterly = c(
sapply(1:(n() - 3), function(x) +any(Incomes[(x + 1):(x + 3)] > 0)),
rep(NA, 3)
)
) %>% as.data.frame
Output:
ID Month Incomes Quarterly
1 1 1 5000 0
2 1 2 0 0
3 1 3 0 0
4 1 4 0 1
5 1 5 0 1
6 1 6 0 1
7 1 7 400 1
8 1 8 300 0
9 1 9 0 0
10 1 10 0 1
11 1 11 0 NA
12 1 12 0 NA
13 1 13 400 NA
14 2 1 0 1
15 2 2 100 0
16 2 3 0 0
17 2 4 0 0
18 2 5 0 1
19 2 6 0 1
20 2 7 0 1
21 2 8 1500 0
22 2 9 0 1
23 2 10 0 1
24 2 11 0 NA
25 2 12 100 NA
26 2 13 750 NA
27 3 1 0 0
28 3 2 0 1
29 3 3 0 1
30 3 4 0 1
31 3 5 700 1
32 3 6 240 1
33 3 7 100 0
34 3 8 0 0
35 3 9 0 1
36 3 10 0 1
37 3 11 0 NA
38 3 12 500 NA
39 3 13 760 NA
And a base equivalent:
transform(df, Quarterly = ave(Incomes, ID,
FUN = function(x) c(
sapply(1:(length(x) - 3), function(y) +any(x[(y + 1):(y + 3)] > 0)),
rep(NA, 3)
)
)
)
I would like to create a new variable called x in my datasheet. When coi_cf is coded 1 or 2, the new variable x should be 1; when coi_cf is coded 0, 99, or NA, x should be 0.
library(dplyr)
df %>%
mutate(x = case_when(coi_cf==1 | coi_cf == 2 ~ 1,
TRUE ~ 0))
coi_cf coi_aufc coi_flafc coi_ra coi_flara x
1 99 0 0 1 1 0
2 99 99 NA 1 3 0
3 99 99 NA 1 3 0
4 0 4 2 1 4 0
5 0 2 4 99 99 0
6 99 4 3 1 3 0
7 99 0 0 0 0 0
8 NA 0 NA 99 NA 0
9 NA 0 0 1 3 0
10 0 0 NA 0 NA 0
11 NA 0 NA 1 3 0
12 NA 0 NA 1 3 0
13 NA 0 NA 1 3 0
14 NA 0 NA 0 NA 0
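As a side note (not part of the original answer), a compact equivalent of the case_when is to test membership directly, since %in% returns FALSE for NA:
library(dplyr)
# Sketch: coi_cf %in% c(1, 2) is FALSE for 0, 99 and NA, so coercing the
# logical to integer reproduces the 1/0 coding of x.
df %>%
  mutate(x = as.integer(coi_cf %in% c(1, 2)))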
data:
structure(list(coi_cf = c(99L, 99L, 99L, 0L, 0L, 99L, 99L, NA,
NA, 0L, NA, NA, NA, NA), coi_aufc = c(0L, 99L, 99L, 4L, 2L, 4L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), coi_flafc = c(0L, NA, NA, 2L,
4L, 3L, 0L, NA, 0L, NA, NA, NA, NA, NA), coi_ra = c(1L, 1L, 1L,
1L, 99L, 1L, 0L, 99L, 1L, 0L, 1L, 1L, 1L, 0L), coi_flara = c(1L,
3L, 3L, 4L, 99L, 3L, 0L, NA, 3L, NA, 3L, 3L, 3L, NA)), class = "data.frame", row.names = c(NA,
-14L))
I might be overengineering this a little, but in case you anticipate a lot of values being replaced and want to avoid deeply nested if-conditions, consider building a "replacement table" (named rtable below) and joining it to your original table (which I'll simply call "df").
rtable <- data.frame(coi_cf=c(0,1,2,99,NA), x=c(0,1,1,0,0))
Using the dplyr package:
newtable <- left_join(df, rtable, by="coi_cf")
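A quick check of the join result (a sketch; it relies on dplyr's default na_matches = "na", so the NA keys in coi_cf match the NA row of rtable and receive x = 0):
library(dplyr)
rtable <- data.frame(coi_cf = c(0, 1, 2, 99, NA), x = c(0, 1, 1, 0, 0))
newtable <- left_join(df, rtable, by = "coi_cf")
# Every row should now carry an x of 0 or 1, with no NAs left over.
table(newtable$x, useNA = "ifany")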
My R skills are not sufficient to tackle this issue, so I'm hoping someone can help.
My data look like this:
head(human.players,25)
Season Episode Round Player Player_type Crowd_size q1_a q2_a q3_a q4_a q5_a
2020   1       1     1      1           3          0    1    0    0    NA
2020   1       1     2      1           3          0    1    1    1    NA
2020   1       1     3      1           3          0    0    0    1    NA
2020   1       2     1      1           3          1    1    0    1    NA
2020   1       2     2      1           3          1    0    1    0    NA
2020   1       2     3      1           3          1    1    1    0    NA
2020   1       3     1      1           3          0    1    0    0    NA
2020   1       3     2      1           3          0    1    1    1    NA
2020   1       3     3      1           3          0    0    1    1    NA
2020   1       4     1      1           3          0    0    1    1    NA
2020   1       4     2      1           3          0    0    1    1    NA
2020   1       4     3      1           3          0    0    1    1    NA
2020   1       5     1      1           2          1    1    0    0    NA
2020   1       5     2      1           2          1    1    1    0    NA
2020   1       5     3      1           2          NA   NA   NA   NA   NA
2020   1       6     1      1           2          0    0    0    0    NA
2020   1       6     2      1           2          0    0    0    0    NA
2020   1       6     3      1           2          NA   NA   NA   NA   NA
2020   1       7     1      1           2          0    1    1    1    NA
2020   1       7     2      1           2          1    0    0    1    NA
2020   1       7     3      1           2          NA   NA   NA   NA   NA
2020   2       1     1      1           3          1    1    0    0    NA
2020   2       1     2      1           3          0    0    0    1    NA
2020   2       1     3      1           3          0    1    1    0    NA
The variables q1_a:q5_a represent whether a player got the question wrong (0) or correct (1). Each player plays in a specific round (there are 7 rounds per episode). In the first 4 rounds, there are 3 players. However, in rounds 5-7, there are only 2 players (the player that was eliminated has NA's - e.g., in episode 1, this is player 3 - see table above).
I need to create a random player. This means that for the first 4 rounds I need to randomly select an answer (for each of the 5 questions) from the three players within that round and add the "random player" row value. For rounds 5 to 7 I need to select an answer from the two players (ignoring the NA) and add the "random player" row value.
An algorithm of sorts would have to look at round 1 (only those rows), sample one value from the three rows, paste it into round 1 (i.e., create row 4 in this example ) and do that for each of the 5 questions. Then for round 2...
This is how it is supposed to look, where I've added player 4 - the random player:
Season Episode Round Player Player_type Crowd_size q1_a q2_a q3_a q4_a q5_a
2020   1       1     1      1           3          0    1    0    0    NA
2020   1       1     2      1           3          0    1    1    1    NA
2020   1       1     3      1           3          0    0    0    1    NA
2020   1       1     4      1           3          0    0    1    1    NA
2020   1       2     1      1           3          1    1    0    1    NA
2020   1       2     2      1           3          1    0    1    0    NA
2020   1       2     3      1           3          1    1    1    0    NA
2020   1       2     4      1           3          1    1    1    0    NA
2020   1       3     1      1           3          0    1    0    0    NA
2020   1       3     2      1           3          0    1    1    1    NA
2020   1       3     3      1           3          0    0    1    1    NA
2020   1       3     4      1           3          0    0    0    1    NA
2020   1       4     1      1           3          0    0    1    1    NA
2020   1       4     2      1           3          0    0    1    1    NA
2020   1       4     3      1           3          0    0    1    1    NA
2020   1       4     4      1           3          0    0    1    1    NA
Writing this, I'm thinking it may be impossible, or at least very difficult, so this question is more of a "hail mary". I presume some combination of sample(), apply(), and a custom function is necessary, but I'm stumped.
Here's one pipe that samples the new players and their scores into a separate frame, which you can then bind_rows back into the original data.
set.seed(2021)
newplayers <- dat %>%
filter(!is.na(q1_a)) %>%
group_by(Season, Episode, Round) %>%
summarize(across(everything(), ~ sample(., size=1)), .groups = "drop") %>%
mutate(Player = NA_integer_, Player_type = NA_integer_)
newplayers
# # A tibble: 8 x 11
# Season Episode Round Player Player_type Crowd_size q1_a q2_a q3_a q4_a q5_a
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <lgl>
# 1 2020 1 1 NA NA 3 0 0 1 1 NA
# 2 2020 1 2 NA NA 3 1 1 0 0 NA
# 3 2020 1 3 NA NA 3 0 1 1 0 NA
# 4 2020 1 4 NA NA 3 0 0 1 1 NA
# 5 2020 1 5 NA NA 2 1 1 1 0 NA
# 6 2020 1 6 NA NA 2 0 0 0 0 NA
# 7 2020 1 7 NA NA 2 0 0 1 1 NA
# 8 2020 2 1 NA NA 3 0 1 0 0 NA
bind_rows(dat, newplayers) %>%
arrange(Season, Episode, Round, is.na(Player), Player) %>%
head(.)
# Season Episode Round Player Player_type Crowd_size q1_a q2_a q3_a q4_a q5_a
# 1 2020 1 1 1 1 3 0 1 0 0 NA
# 2 2020 1 1 2 1 3 0 1 1 1 NA
# 3 2020 1 1 3 1 3 0 0 0 1 NA
# 4 2020 1 1 NA NA 3 0 0 1 1 NA
# 5 2020 1 2 1 1 3 1 1 0 1 NA
# 6 2020 1 2 2 1 3 1 0 1 0 NA
I didn't know what values to put into Player*, so I chose NA.
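If you prefer the sampled row to be labelled as player 4, as in the expected output above, a small tweak of that last mutate would do it (a sketch; Player_type = 1 is an assumption taken from the expected table):
# Sketch: label the random player as 4 instead of NA.
newplayers <- newplayers %>%
  mutate(Player = 4L, Player_type = 1L)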
Data
# dput(dat)
dat <- structure(list(Season = c(2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L), Episode = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), Round = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 1L, 1L, 1L), Player = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), Player_type = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Crowd_size = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), q1_a = c(0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, NA, 0L, 0L, NA, 0L, 1L, NA, 1L, 0L, 0L), q2_a = c(1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, NA, 0L, 0L, NA, 1L, 0L, NA, 1L, 0L, 1L), q3_a = c(0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, NA, 0L, 0L, NA, 1L, 0L, NA, 0L, 0L, 1L), q4_a = c(0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, NA, 0L, 0L, NA, 1L, 1L, NA, 0L, 1L, 0L), q5_a = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA, -24L))
In my dataset I have the ZIPCODE of 600K+ IDs. If an ID moves to a different address, I want to determine at which zipcode they lived the longest and put a '1' for that specific year in that row (no need to combine rows, as I want to know where they lived in which year). That way an ID only has a '1' for a certain year in one row (if there are multiple rows for that ID). The highlighted case is what I don't want: a '1' in two rows for the same year. In the preferred dataset there is only one '1' per year per ID.
For example: ID 4 lived in 2 places in 2013 (NY and LA), so there are 2 rows. At this point there is a 1 in each row for 2013, and I only want a 1 in the row where the ID lived the longest between 1-1-2013 and 31-12-2018. ID 4 lived longer in LA than in NY during 2013, so the 1 should only be in the LA row (and in this case the NY row can be removed because only '0's remain).
I can also put this file in RStudio.
Thank you!
structure(v1)
ID CITY ZIPCODE DATE_START DATE_END DATE_END.1 X2013 X2014 X2015 X2016 X2017 X2018
1 1 NY 1234EF 1-12-2003 31-12-2018 1 1 1 1 1 1
2 2 NY 1234CD 1-12-2003 14-1-2019 14-1-2019 1 1 1 1 1 1
3 2 NY 1234AB 15-1-2019 31-12-2018 0 0 0 0 0 0
4 3 NY 1234AB 15-1-2019 31-12-2018 0 0 0 0 0 0
5 3 NY 1234CD 1-12-2003 14-1-2019 14-1-2019 1 1 1 1 1 1
6 4 LA 1111AB 4-5-2013 31-12-2018 1 1 1 1 1 1
7 4 NY 2222AB 1-12-2003 3-5-2013 3-5-2013 1 0 0 0 0 0
8 5 MIAMI 5555CD 6-2-2015 20-6-2016 20-6-2016 0 0 1 1 0 0
9 5 VEGAS 3333AB 1-1-2004 31-12-2018 1 1 1 1 1 1
10 5 ORLANDO 4444AB 26-2-2004 5-2-2015 5-2-2015 1 1 1 0 0 0
11 5 MIAMI 5555AB 21-6-2016 31-12-2018 31-12-2018 0 0 0 1 1 1
12 5 MIAMI 5555AB 1-1-2019 31-12-2018 0 0 0 0 0 0
13 6 AUSTIN 6666AB 28-2-2017 3-11-2017 3-11-2017 0 0 0 0 1 0
14 6 AUSTIN 6666AB 4-11-2017 31-12-2018 0 0 0 0 1 1
15 6 AUSTIN 7777AB 20-1-2017 27-2-2017 27-2-2017 0 0 0 0 1 0
16 6 AUSTIN 8888AB 1-12-2003 19-1-2017 19-1-2017 1 1 1 1 1 0
>
structure(list(ID = c(1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L,
5L, 5L, 6L, 6L, 6L, 6L), CITY = structure(c(4L, 4L, 4L, 4L, 4L,
2L, 4L, 3L, 6L, 5L, 3L, 3L, 1L, 1L, 1L, 1L), .Label = c("AUSTIN",
"LA", "MIAMI", "NY", "ORLANDO", "VEGAS"), class = "factor"),
ZIPCODE = structure(c(4L, 3L, 2L, 2L, 3L, 1L, 5L, 9L, 6L,
7L, 8L, 8L, 10L, 10L, 11L, 12L), .Label = c("1111AB", "1234AB",
"1234CD", "1234EF", "2222AB", "3333AB", "4444AB", "5555AB",
"5555CD", "6666AB", "7777AB", "8888AB"), class = "factor"),
DATE_START = structure(c(3L, 3L, 4L, 4L, 3L, 10L, 3L, 11L,
1L, 7L, 6L, 2L, 8L, 9L, 5L, 3L), .Label = c("1-1-2004", "1-1-2019",
"1-12-2003", "15-1-2019", "20-1-2017", "21-6-2016", "26-2-2004",
"28-2-2017", "4-11-2017", "4-5-2013", "6-2-2015"), class = "factor"),
DATE_END = structure(c(1L, 2L, 1L, 1L, 2L, 1L, 7L, 4L, 1L,
9L, 8L, 1L, 6L, 1L, 5L, 3L), .Label = c("", "14-1-2019",
"19-1-2017", "20-6-2016", "27-2-2017", "3-11-2017", "3-5-2013",
"31-12-2018", "5-2-2015"), class = "factor"), DATE_END.1 = structure(c(7L,
1L, 7L, 7L, 1L, 7L, 6L, 3L, 7L, 8L, 7L, 7L, 5L, 7L, 4L, 2L
), .Label = c("14-1-2019", "19-1-2017", "20-6-2016", "27-2-2017",
"3-11-2017", "3-5-2013", "31-12-2018", "5-2-2015"), class = "factor"),
X2013 = c(1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L,
0L, 0L, 0L, 1L), X2014 = c(1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L), X2015 = c(1L, 1L, 0L, 0L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L), X2016 = c(1L,
1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L
), X2017 = c(1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L,
0L, 1L, 1L, 1L, 1L), X2018 = c(1L, 1L, 0L, 0L, 1L, 1L, 0L,
0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-16L))
You can use a little help from the lubridate package to calculate how many days are spent at each location. Then we can group_by ID and use case_when to assign 1 when the time is the maximum and 0 otherwise.
library(lubridate)
library(dplyr)
v1 %>%
dplyr::select(ID,CITY,ZIPCODE,DATE_START,DATE_END.1) %>%
rowwise() %>%
mutate("X2013" = max(0, min(dmy("31-12-2013"),dmy(DATE_END.1)) - max(dmy("1-1-2013"),dmy(DATE_START))),
"X2014" = max(0, min(dmy("31-12-2014"),dmy(DATE_END.1)) - max(dmy("1-1-2014"),dmy(DATE_START))),
"X2015" = max(0, min(dmy("31-12-2015"),dmy(DATE_END.1)) - max(dmy("1-1-2015"),dmy(DATE_START))),
"X2016" = max(0, min(dmy("31-12-2016"),dmy(DATE_END.1)) - max(dmy("1-1-2016"),dmy(DATE_START))),
"X2017" = max(0, min(dmy("31-12-2017"),dmy(DATE_END.1)) - max(dmy("1-1-2017"),dmy(DATE_START))),
"X2018" = max(0, min(dmy("31-12-2018"),dmy(DATE_END.1)) - max(dmy("1-1-2018"),dmy(DATE_START)))) %>%
ungroup %>%
group_by(ID) %>%
mutate_at(vars(starts_with("X")),list(~ case_when(. == max(.) ~ 1,
TRUE ~ 0)))
# A tibble: 16 x 11
# Groups: ID [6]
ID CITY ZIPCODE DATE_START DATE_END.1 X2013 X2014 X2015 X2016 X2017 X2018
<int> <fct> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 NY 1234EF 1-12-2003 31-12-2018 1 1 1 1 1 1
2 2 NY 1234CD 1-12-2003 14-1-2019 1 1 1 1 1 1
3 2 NY 1234AB 15-1-2019 31-12-2018 0 0 0 0 0 0
4 3 NY 1234AB 15-1-2019 31-12-2018 0 0 0 0 0 0
5 3 NY 1234CD 1-12-2003 14-1-2019 1 1 1 1 1 1
6 4 LA 1111AB 4-5-2013 31-12-2018 1 1 1 1 1 1
7 4 NY 2222AB 1-12-2003 3-5-2013 0 0 0 0 0 0
8 5 MIAMI 5555CD 6-2-2015 20-6-2016 0 0 0 0 0 0
9 5 VEGAS 3333AB 1-1-2004 31-12-2018 1 1 1 1 1 1
10 5 ORLANDO 4444AB 26-2-2004 5-2-2015 1 1 0 0 0 0
11 5 MIAMI 5555AB 21-6-2016 31-12-2018 0 0 0 0 1 1
12 5 MIAMI 5555AB 1-1-2019 31-12-2018 0 0 0 0 0 0
13 6 AUSTIN 6666AB 28-2-2017 3-11-2017 0 0 0 0 1 0
14 6 AUSTIN 6666AB 4-11-2017 31-12-2018 0 0 0 0 0 1
15 6 AUSTIN 7777AB 20-1-2017 27-2-2017 0 0 0 0 0 0
16 6 AUSTIN 8888AB 1-12-2003 19-1-2017 1 1 1 1 0 0
There is certainly a way to implement the first mutate call without manually writing out each year, but it would take more work than just typing it out.
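For completeness, here is one way that repetition could be avoided (a sketch, not part of the original answer; the helper overlap_days, the year range 2013-2018, and the use of across() from dplyr 1.0+ are assumptions):
library(lubridate)
library(dplyr)

# Sketch: for each row, count the days the stay overlaps a given calendar year,
# then flag the per-ID maximum as before.
overlap_days <- function(start, end, year) {
  pmax(0, as.numeric(pmin(dmy(paste0("31-12-", year)), dmy(as.character(end))) -
                     pmax(dmy(paste0("1-1-", year)),  dmy(as.character(start)))))
}

v2 <- v1 %>% dplyr::select(ID, CITY, ZIPCODE, DATE_START, DATE_END.1)
for (y in 2013:2018) {
  v2[[paste0("X", y)]] <- overlap_days(v2$DATE_START, v2$DATE_END.1, y)
}

v2 %>%
  group_by(ID) %>%
  mutate(across(starts_with("X"), ~ as.numeric(. == max(.))))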
I have a matrix with all cell values being from -1:1 and another dataframe that identifies row/column cells for each cell I need to find.
I want to add columns to the data frame that contain the values in the cells identified in the matrix. Example of what I have and what I want below:
HAVE MATRIX:
1 2 3 4 5 6 7 8 9 10
1 0 0 0 1 0 1 1 0 0 0
2 0 0 -1 -1 1 1 -1 -1 0 0
3 0 0 -1 0 0 -1 0 -1 0 0
4 -1 1 0 0 1 1 0 -1 1 -1
5 -1 1 0 -1 -1 0 0 0 0 1
6 1 -1 1 1 0 0 -1 -1 0 1
7 0 -1 1 0 1 1 0 1 -1 0
8 0 0 -1 -1 -1 0 1 -1 0 1
9 -1 1 0 1 1 -1 0 1 -1 -1
10 -1 1 -1 -1 -1 -1 1 0 1 -1
HAVE DATAFRAME:
i j k
1 3 4 2
2 4 8 10
3 10 7 5
4 2 6 8
5 9 10 7
6 2 10 4
7 7 8 10
8 6 10 8
9 2 9 5
10 9 7 1
WANT DATAFRAME:
i j k j,i k,i k,j
1 3 4 2 0 -1 -1
2 4 8 10 -1 -1 0
3 10 7 5 0 1 0
4 2 6 8 -1 0 0
5 9 10 7 1 -1 0
6 2 10 4 1 1 -1
7 7 8 10 1 1 0
8 6 10 8 -1 0 1
9 2 9 5 1 1 0
10 9 7 1 -1 0 1
One option would be to use combn or sapply (if the combinations need to be in a specific order): loop through the list, extract the columns of the second dataset, use those as a row/column index to extract the corresponding elements from the first dataset, and cbind.
indList <- list(ji = c('j', 'i'), ki = c('k', 'i'), kj = c('k', 'j'))
cbind(df2, sapply(indList, function(x) m1[as.matrix(df2[x])]))
# i j k ji ki kj
#1 3 4 2 0 -1 -1
#2 4 8 10 -1 -1 0
#3 10 7 5 0 1 0
#4 2 6 8 -1 0 0
#5 9 10 7 1 -1 0
#6 2 10 4 1 1 -1
#7 7 8 10 1 1 0
#8 6 10 8 -1 0 1
#9 2 9 5 1 1 0
#10 9 7 1 -1 0 1
It is also possible with combn:
cbind(df2, combn(df2, 2, FUN = function(x) m1[as.matrix(x[2:1])]))
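The reason m1[as.matrix(...)] works is R's matrix indexing: a two-column matrix of (row, column) pairs extracts one element per pair. A minimal illustration (a sketch):
# Sketch: each row of the index matrix is a (row, column) pair into m1.
idx <- cbind(c(4, 8), c(3, 10))
m1[idx]   # returns m1[4, 3] and m1[8, 10] as a vector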
data
df2 <- structure(list(i = c(3L, 4L, 10L, 2L, 9L, 2L, 7L, 6L, 2L, 9L),
j = c(4L, 8L, 7L, 6L, 10L, 10L, 8L, 10L, 9L, 7L), k = c(2L,
10L, 5L, 8L, 7L, 4L, 10L, 8L, 5L, 1L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"))
m1 <- structure(c(0L, 0L, 0L, -1L, -1L, 1L, 0L, 0L, -1L, -1L, 0L, 0L,
0L, 1L, 1L, -1L, -1L, 0L, 1L, 1L, 0L, -1L, -1L, 0L, 0L, 1L, 1L,
-1L, 0L, -1L, 1L, -1L, 0L, 0L, -1L, 1L, 0L, -1L, 1L, -1L, 0L,
1L, 0L, 1L, -1L, 0L, 1L, -1L, 1L, -1L, 1L, 1L, -1L, 1L, 0L, 0L,
1L, 0L, -1L, -1L, 1L, -1L, 0L, 0L, 0L, -1L, 0L, 1L, 0L, 1L, 0L,
-1L, -1L, -1L, 0L, -1L, 1L, -1L, 1L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, -1L, 0L, -1L, 1L, 0L, 0L, 0L, -1L, 1L, 1L, 0L, 1L, -1L, -1L
), .Dim = c(10L, 10L), .Dimnames = list(c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10"), c("1", "2", "3", "4", "5", "6",
"7", "8", "9", "10")))
I am struggling to find a simple way to fill values based on two simple conditions.
I am trying to fill the variable working with 1 between the first and last "1" for each dayweek. The example is more telling.
id hours dayweek working
1 1 1 Friday 0
2 1 2 Friday 0
3 1 3 Friday 0
4 1 4 Friday 0
5 1 5 Friday 0
6 1 6 Friday 0
7 1 7 Friday 0
8 1 8 Friday 1
9 1 9 Friday 0
10 1 10 Friday 0
11 1 11 Friday 0
12 1 12 Friday 0
13 1 13 Friday 0
14 1 14 Friday 0
15 1 15 Friday 0
16 1 16 Friday 0
17 1 17 Friday 1
18 1 18 Friday 0
19 1 19 Friday 0
20 1 20 Friday 0
I am trying to do this.
id hours dayweek working
1 1 1 Friday 0
2 1 2 Friday 0
3 1 3 Friday 0
4 1 4 Friday 0
5 1 5 Friday 0
6 1 6 Friday 0
7 1 7 Friday 0
8 1 8 Friday 1
9 1 9 Friday 1
10 1 10 Friday 1
11 1 11 Friday 1
12 1 12 Friday 1
13 1 13 Friday 1
14 1 14 Friday 1
15 1 15 Friday 1
16 1 16 Friday 1
17 1 17 Friday 1
18 1 18 Friday 0
19 1 19 Friday 0
20 1 20 Friday 0
The group_by must be id and dayweek.
Any clue?
The data
structure(list(id = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3"), class = "factor"), hours = 1:20, dayweek = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("Friday", "Monday", "Saturday", "Sunday",
"Thursday", "Tuesday", "Wedesnday"), class = "factor"), working = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0)), row.names = c(NA,
20L), class = "data.frame", .Names = c("id", "hours", "dayweek",
"working"))
alternative data of same problem
dt = structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 29L, 30L,
31L, 32L, 33L, 34L, 35L, 36L, 57L, 58L, 59L, 60L, 61L, 62L, 63L,
64L), id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), hours = c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L), dayweek = structure(c(1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L), .Label = c("Friday", "Monday", "Saturday",
"Sunday", "Thursday", "Tuesday", "Wedesnday"), class = "factor"),
working = c(0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L,
0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-24L), .Names = c("X", "id", "hours", "dayweek", "working"))
We could use data.table to do this. We convert the 'data.frame' to a 'data.table' (setDT(df1)). Grouped by 'id' and 'dayweek', we get the numeric index of the elements in 'working' that are equal to 1 ('tmp'), only if there is at least one 1 value in the group (if(any(working==1))). We take the sequence (:) between the first (head(tmp,1)) and last (tail(tmp, 1)) positions and wrap it with .I to get the row index ('i1'). We then use that index to assign the corresponding 'working' elements to 1.
library(data.table)
i1 <- setDT(df1)[, if(any(working==1)){tmp <- which(working==1)
.I[head(tmp,1):tail(tmp,1)]} , by = .(id, dayweek)]$V1
df1[i1, working:=1L]
df1
# id hours dayweek working
# 1: 1 1 Friday 0
# 2: 1 2 Friday 0
# 3: 1 3 Friday 0
# 4: 1 4 Friday 0
# 5: 1 5 Friday 0
# 6: 1 6 Friday 0
# 7: 1 7 Friday 0
# 8: 1 8 Friday 1
# 9: 1 9 Friday 1
#10: 1 10 Friday 1
#11: 1 11 Friday 1
#12: 1 12 Friday 1
#13: 1 13 Friday 1
#14: 1 14 Friday 1
#15: 1 15 Friday 1
#16: 1 16 Friday 1
#17: 1 17 Friday 1
#18: 1 18 Friday 0
#19: 1 19 Friday 0
#20: 1 20 Friday 0
Or a similar solution using dplyr (as suggested by @David Arenburg) would be to group by the 'id' and 'dayweek' columns, use min and max to get the first and last positions where working == 1, and replace those elements of working with 1. If there is no 1 value for a particular group, we wrap with ifelse to return 0 for that group.
library(dplyr)
df1 %>%
group_by(id, dayweek) %>%
mutate(new = any(working ==1),
working = ifelse(new, replace(working,
min(which(working == 1)):max(which(working == 1)), 1),
as.numeric(new))) %>%
select(-new)
#Source: local data frame [20 x 4]
#Groups: id, dayweek
#
# id hours dayweek working
#1 1 1 Friday 0
#2 1 2 Friday 0
#3 1 3 Friday 0
#4 1 4 Friday 0
#5 1 5 Friday 0
#6 1 6 Friday 0
#7 1 7 Friday 0
#8 1 8 Friday 1
#9 1 9 Friday 1
#10 1 10 Friday 1
#11 1 11 Friday 1
#12 1 12 Friday 1
#13 1 13 Friday 1
#14 1 14 Friday 1
#15 1 15 Friday 1
#16 1 16 Friday 1
#17 1 17 Friday 1
#18 1 18 Friday 0
#19 1 19 Friday 0
#20 1 20 Friday 0
Or a compact option suggested by @Khashaa, where we multiply the cummax of 'working' by the cummax of the reversed (rev) 'working', so that only elements that are 1 in both vectors remain 1, while the others are replaced by 0.
df1 %>%
group_by(id, dayweek) %>%
mutate(working = cummax(working)*rev(cummax(rev(working))))
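To see why the cummax trick works, here is a toy illustration (a sketch):
# Sketch: cummax(w) is 1 from the first 1 onward; rev(cummax(rev(w))) is 1 up
# to the last 1; their product is 1 only between the first and last 1.
w <- c(0, 0, 1, 0, 0, 1, 0)
cummax(w)                        # 0 0 1 1 1 1 1
rev(cummax(rev(w)))              # 1 1 1 1 1 1 0
cummax(w) * rev(cummax(rev(w)))  # 0 0 1 1 1 1 0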