Sorry if this post is not well organized, first time stack overflower...
I am trying to create a column to create a order within each IDs, but the twist is that if there is a gap year, order needs to start from the beginning.
Please check example and expected result below.
I wasn't able to find appropriate code for it.. I cannot think of anything :( Please help me! I appreciate alot!
One option is to create a new group variable when difference between the year is greater than 1 and create a sequence in each group using row_number().
library(dplyr)
df %>%
group_by(ID, group = cumsum(c(1, diff(Year) > 1))) %>%
mutate(order = row_number()) %>%
ungroup() %>%
select(-group)
# ID Year order
# <fct> <int> <int>
# 1 A 2007 1
# 2 A 2008 2
# 3 A 2009 3
# 4 A 2013 1
# 5 A 2014 2
# 6 A 2015 3
# 7 A 2016 4
# 8 B 2010 1
# 9 B 2012 1
#10 B 2013 2
Using base R ave that would be
as.integer(with(df, ave(ID, ID, cumsum(c(1, diff(Year) > 1)), FUN = seq_along)))
#[1] 1 2 3 1 2 3 4 1 1 2
data
df <- data.frame(ID = c(rep("A", 7), rep("B", 3)),
Year = c(2007:2009, 2013:2016, 2010, 2012, 2013), stringsAsFactors = FALSE)
A data.table option:
library(data.table)
setDT(df)
df[, jump := Year - shift(Year) - 1, by = ID
][is.na(jump), jump := 0
][, order := seq_len(.N), by = .(ID, cumsum(jump))]
# ID Year jump order
# 1: A 2007 0 1
# 2: A 2008 0 2
# 3: A 2009 0 3
# 4: A 2013 3 1
# 5: A 2014 0 2
# 6: A 2015 0 3
# 7: A 2016 0 4
# 8: B 2010 0 1
# 9: B 2012 1 1
# 10: B 2013 0 2
Or using data.table::nafill() available in data.table v1.12.3 (still in development):
df[, jump := nafill(Year - shift(Year) - 1, fill = 0), by = ID
][, order := seq_len(.N), by = .(ID, cumsum(jump))]
We can take the difference of 'Year' and the lag of 'Year', get the cumulative sum, use that in the group_by along with 'ID' and create the order as row_number()
library(dplyr)
df %>%
group_by(ID, grp = cumsum(Year - lag(Year, default = Year[1]) > 1)) %>%
mutate(order = row_number()) %>%
ungroup %>%
select(-grp)
# A tibble: 10 x 3
# ID Year order
# <chr> <dbl> <int>
# 1 A 2007 1
# 2 A 2008 2
# 3 A 2009 3
# 4 A 2013 1
# 5 A 2014 2
# 6 A 2015 3
# 7 A 2016 4
# 8 B 2010 1
# 9 B 2012 1
#10 B 2013 2
data
df <- structure(list(ID = c("A", "A", "A", "A", "A", "A", "A", "B",
"B", "B"), Year = c(2007, 2008, 2009, 2013, 2014, 2015, 2016,
2010, 2012, 2013)), class = "data.frame", row.names = c(NA, -10L
))
Related
I have a dataset where each row represents a continuous spell with start and end months and years. For spells which are over more than one year, I want to pivot them so that there is one row per year.
Input:
library(data.table)
dat <- data.table(id = c(1,1,2), b_sp_y = c(2008, 2009, 2011), b_sp_m = c(3, 8, 6),
e_sp_y = c(2008, 2010, 2013), e_sp_m = c(5, 1, 9))
id b_sp_y b_sp_m e_sp_y e_sp_m
1: 1 2008 3 2008 5
2: 1 2009 8 2010 1
3: 2 2011 6 2013 9
Here is my truly horrifyingly ugly code:
dat[, y_dif := e_sp_y - b_sp_y]
res <- dat[y_dif == 0][, c("e_sp_y", "y_dif") := NULL]
setnames(res, "b_sp_y", "year")
tmp <- dat[y_dif > 0]
for(i in 1:nrow(tmp)){
foo <- tmp[i, ]
foo2 <- data.table(year = foo$b_sp_y:(foo$b_sp_y + foo$y_dif))[,id := foo$id]
foo2[, b_sp_m := c(foo$b_sp_m, rep(1, foo$y_dif))]
foo2[, e_sp_m := c(rep(12, foo$y_dif), foo$e_sp_m)]
res <- rbind(res, foo2)
}
Output:
id year b_sp_m e_sp_m
1: 1 2008 3 5
2: 1 2009 8 12
3: 1 2010 1 1
4: 2 2011 6 12
5: 2 2012 1 12
6: 2 2013 1 9
This is ugly and slow to a crawl, but I couldn't really come up with anything better.
Thanks for your help!
Proceeding by row fill in the three columns using summarize as shown.
library(data.table)
library(dplyr)
dat %>%
rowwise() %>%
summarize(id = id,
year = b_sp_y:e_sp_y,
b_sp_m = replace(1 + 0 * year, 1, b_sp_m),
e_sp_m = replace(12 + 0 * year, length(year), e_sp_m))
giving:
# A tibble: 6 × 4
id year b_sp_m e_sp_m
<dbl> <int> <dbl> <dbl>
1 1 2008 3 5
2 1 2009 8 12
3 1 2010 1 1
4 2 2011 6 12
5 2 2012 1 12
6 2 2013 1 9
or using only data.table:
library(data.table)
dat[, .(id = id,
year = b_sp_y:e_sp_y,
b_sp_m = replace(1 + 0 * b_sp_y:e_sp_y, 1, b_sp_m),
e_sp_m = replace(12 + 0 * b_sp_y:e_sp_y, e_sp_y - b_sp_y + 1, e_sp_m)),
by = 1:nrow(dat)][, -1]
Added
Here are some slightly more compact variations of the above:
library(data.table)
library(dplyr)
dat %>%
rowwise() %>%
summarize(id = id,
year = b_sp_y:e_sp_y,
b_sp_m = c(b_sp_m, year[-1]^0),
e_sp_m = c(12 * year[-1]^0, e_sp_m))
library(data.table)
dat[, {
year <- b_sp_y:e_sp_y
.(id = id,
year = year,
b_sp_m = c(b_sp_m, year[-1]^0),
e_sp_m = c(12 * year[-1]^0, e_sp_m))
},
by = 1:nrow(dat)][, -1]
I'd suggest: make a date sequence for each id/row, group by id and year, summarize first and last month.
library(dplyr); library(lubridate)
dat %>%
mutate(start = ymd(paste(b_sp_y, b_sp_m, "01", sep = "-")),
end = ymd(paste(e_sp_y, e_sp_m, "01", sep = "-"))) %>%
group_by(id, row = row_number()) %>%
summarize(months = seq.Date(start, end, by = "month")) %>%
group_by(id, year = year(months)) %>%
summarize(from = month(min(months)),
to = month(max(months)), .groups = "drop")
Result:
# A tibble: 6 × 4
id year from to
<dbl> <dbl> <dbl> <dbl>
1 1 2008 3 5
2 1 2009 8 12
3 1 2010 1 1
4 2 2011 6 12
5 2 2012 1 12
6 2 2013 1 9
We create a sequence column 'rn', loop over the year columns, get the sequence in a list, unnest the column, and do a group by the 'rn' and replace the 'b', 'e' columns where there are duplicates to 1 and 12 respectively
library(dplyr)
library(purrr)
library(tidyr)
dat %>%
mutate(rn=row_number(),
year = map2(b_sp_y, e_sp_y, `:`),
b_sp_y= NULL,
e_sp_y = NULL) %>%
unnest(year) %>%
group_by(rn) %>%
mutate(b_sp_m = replace(b_sp_m, duplicated(b_sp_m), 1),
e_sp_m = replace(e_sp_m, duplicated(e_sp_m, fromLast = TRUE) &
n() > 1, 12)) %>%
ungroup %>%
select(-rn) %>%
relocate(year, .after = 1)
-output
# A tibble: 6 × 4
id year b_sp_m e_sp_m
<dbl> <int> <dbl> <dbl>
1 1 2008 3 5
2 1 2009 8 12
3 1 2010 1 1
4 2 2011 6 12
5 2 2012 1 12
6 2 2013 1 9
OP's output of 'res'
> res
id year b_sp_m e_sp_m
<num> <num> <num> <num>
1: 1 2008 3 5
2: 1 2009 8 12
3: 1 2010 1 1
4: 2 2011 6 12
5: 2 2012 1 12
6: 2 2013 1 9
I want to keep distinct rows on a data frame, with an algorithm that chooses the last value per group (as dplyr::distinct() does by default), but only if it's not NA. I've seen this great answer on SO that relies on data.table, but I can't scale it to data with more than one grouping variable.
To demonstrate the problem, I start with the minimal example that does work, then scale it up. So first, consider the following data:
library(tibble)
df_id_and_type <-
tibble::tribble(
~id, ~type,
1, "A",
1, NA,
2, "B",
3, "A",
3, NA,
3, "D",
3, NA,
4, NA,
4, "C",
5, "A",
6, NA,
6, "B",
6, NA
)
I want to get the distinct type values per id, by choosing the last value unless it's NA. If the last is NA then go up until there's non-NA. So this answer shows us how to do it with data.table:
library(data.table)
dt_id_and_type <- as.data.table(df_id_and_type)
dt_id_and_type$typena <- is.na(dt_id_and_type$type)
setorderv(dt_id_and_type, c("typena","id"), order = c(-1, 1))
dt_id_and_type[!duplicated(id, fromLast = TRUE), c("id", "type"), with = FALSE]
#> id type
#> 1: 1 A
#> 2: 2 B
#> 3: 3 D
#> 4: 4 C
#> 5: 5 A
#> 6: 6 B
But what to do if we have more than one grouping variable (i.e, not only id)? In the following example I add a year variable:
df_id_year_and_type <-
df_id_and_type %>%
add_column(year = c(2002, 2002, 2008, 2010, 2010, 2010, 2013, 2020, 2020, 2009, 2010, 2010, 2012),
.before = "type")
df_id_year_and_type
#> # A tibble: 13 x 3
#> id year type
#> <dbl> <dbl> <chr>
#> 1 1 2002 A
#> 2 1 2002 <NA>
#> 3 2 2008 B
#> 4 3 2010 A
#> 5 3 2010 <NA>
#> 6 3 2010 D
#> 7 3 2013 <NA>
#> 8 4 2020 <NA>
#> 9 4 2020 C
#> 10 5 2009 A
#> 11 6 2010 <NA>
#> 12 6 2010 B
#> 13 6 2012 <NA>
My expected output would be:
## # A tibble: 8 x 3
## id year type
## <dbl> <dbl> <chr>
## 1 1 2002 A
## 2 2 2008 B
## 3 3 2010 D
## 4 3 2013 NA # for id 3 in year 2013 there was only `NA`, so that's what we get
## 5 4 2020 C
## 6 5 2009 A
## 7 6 2010 B
## 8 6 2012 NA # same as comment above
Any idea how I could scale the solution that worked in 1-grouping-var case to the current data? The first 2 lines of code are no-brainer:
dt_id_year_and_type <- as.data.table(df_id_year_and_type)
dt_id_year_and_type$typena <- is.na(dt_id_year_and_type$type)
setorderv(dt_id_year_and_type, c("typena","id"), order = c(-1, 1)) # <--- how to account for `year`?
dt_id_year_and_type[!duplicated(id, fromLast = TRUE), c("id", "type"), with = FALSE] # <--- here too...
Here some data.table-based solutions.
setDT(df_id_year_and_type)
method 1
na.omit(df_id_year_and_type, cols="type") drops NA rows based on column type.
unique(df_id_year_and_type[, .(id, year)], fromLast=TRUE) finds all the groups.
And by joining them (using the last match: mult="last"), we obtain the desired output.
na.omit(df_id_year_and_type, cols="type"
)[unique(df_id_year_and_type[, .(id, year)], fromLast=TRUE),
on=c('id', 'year'),
mult="last"]
# id year type
# <num> <num> <char>
# 1: 1 2002 A
# 2: 2 2008 B
# 3: 3 2010 D
# 4: 3 2013 <NA>
# 5: 4 2020 C
# 6: 5 2009 A
# 7: 6 2010 B
# 8: 6 2012 <NA>
method 2
df_id_year_and_type[df_id_year_and_type[, .I[which.max(cumsum(!is.na(type)))], .(id, year)]$V1,]
method 3
(likely slower because of [ overhead)
df_id_year_and_type[, .SD[which.max(cumsum(!is.na(type)))], .(id, year)]
I would propose this solution in which you exclude the unwanted rows prior to unique. If all observations for a group are NA, sum(is.na(x)) / .N is equal to 1 and we proceed from there
library(tibble)
library(data.table)
df_id_and_type <-
tibble::tribble(
~id, ~type,
1, "A",
1, NA,
2, "B",
3, "A",
3, NA,
3, "D",
3, NA,
4, NA,
4, "C",
5, "A",
6, NA,
6, "B",
6, NA
)
df_id_year_and_type <-
df_id_and_type %>%
add_column(year = c(2002, 2002, 2008, 2010, 2010, 2010, 2013, 2020, 2020, 2009, 2010, 2010, 2012),
.before = "type")
# convert to data.table
dt_id_year_and_type <- as.data.table(df_id_year_and_type)
# define grouping vars
grouping_vars <- c("id", "year")
# are all types na for a group?
dt_id_year_and_type[, na_ratio := sum(is.na(type)) / .N,
by = c(grouping_vars)]
# remove all lines that are NA, except they are from a group in which all
# observations are NA
dt_id_year_and_type <- dt_id_year_and_type[na_ratio == 1 | !is.na(type)]
# sort correctly
setorderv(dt_id_year_and_type, grouping_vars)
dt_id_year_and_type
#> id year type na_ratio
#> 1: 1 2002 A 0.5000000
#> 2: 2 2008 B 0.0000000
#> 3: 3 2010 A 0.3333333
#> 4: 3 2010 D 0.3333333
#> 5: 3 2013 <NA> 1.0000000
#> 6: 4 2020 C 0.5000000
#> 7: 5 2009 A 0.0000000
#> 8: 6 2010 B 0.5000000
#> 9: 6 2012 <NA> 1.0000000
# keep only the last observation of each group
dt_unique <- unique(dt_id_year_and_type, by = grouping_vars, fromLast = TRUE)
remove no longer needed helper column
dt_unique[, na_ratio := NULL]
dt_unique
#> id year type
#> 1: 1 2002 A
#> 2: 2 2008 B
#> 3: 3 2010 D
#> 4: 3 2013 <NA>
#> 5: 4 2020 C
#> 6: 5 2009 A
#> 7: 6 2010 B
#> 8: 6 2012 <NA>
Another possible solution:
library(tidyverse)
df_id_year_and_type %>%
group_by(id, year) %>%
fill(type, .direction = "downup") %>%
summarise(type = last(type), .groups = "drop")
#> # A tibble: 8 × 3
#> id year type
#> <dbl> <dbl> <chr>
#> 1 1 2002 A
#> 2 2 2008 B
#> 3 3 2010 D
#> 4 3 2013 <NA>
#> 5 4 2020 C
#> 6 5 2009 A
#> 7 6 2010 B
#> 8 6 2012 <NA>
library(dplyr)
A simple, easy to read example of the basic case is
df_id_and_type %>% filter(!is.na(type)) %>%
filter(id != lead(id) | id == max(id))
extending to the second criteria
df_id_year_and_type %>% filter(!is.na(type)) %>%
filter((id != lead(id) | id == max(id)) &
(year != lead(year) | year == max(year)))
It is clear and easy to understand. If you wish to retain the distinct groupings with no result you can either merge distinct back or insert another OR clause in the filters
Why not using a simple max ?
setDT(df_id_year_and_type)
df_id_year_and_type[,max(type, na.rm=T), by=.(id, year)]
You will get a warning when there is only NA and the option na.rm is TRUE, but you can suppress it easily:
df_id_year_and_type[,suppressWarnings(max(type, na.rm=T)), by=.(id, year)]
Or alternatively, test if all values are NA:
df_id_year_and_type[,ifelse(all(is.na(type)), NA_character_, max(type, na.rm=T)), by=.(id, year)]
Let me illustrate my question with an example:
Sample data:
df<-data.frame(BirthYear = c(1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005), Number= c(1,1,1,1,1,1,1,1,1,1,1), Group = c("g", "g", "g", "g", "g", "g","t","t","t","t","t"))
df
BirthYear Number Group
1 1995 1 g
2 1996 1 g
3 1997 1 g
4 1998 1 g
5 1999 1 g
6 2000 1 g
7 2001 1 t
8 2002 1 t
9 2003 1 t
10 2004 1 t
11 2005 1 t
and
df1<- structure(list(Year = c(2015, 2016, 2017, 2018, 2019, 2020)), class = "data.frame", row.names = c(NA,
-6L))
df1
Year
1 2015
2 2016
3 2017
4 2018
5 2019
6 2020
Now I want to add new columns to df1: g1, g2, t1 and t2.
g1 and t1 respectively represent the sum of df$Number for all instances of a group (g or t in df) where df1$Year - df$BirthYear is greater than 18 and lower than 21, so basically if someone is in the age between 19 & 20.
g2 and t2 represent the sum of df$Number for all instances of a group where the difference in years is lower than 19.
I want to end up with the following:
df1
Year g1 g2 t1 t2
1 2015 2 4 0 5
2 2016 2 3 0 5
3 2017 2 2 0 5
4 2018 2 1 0 5
5 2019 2 0 0 5
6 2020 1 0 1 4
I know I could make a for-loop over df1 to create the new columns but I don't know how to specify the condition to get the correct group sums for each year.
I hope this example makes clear what I'm trying to achieve.
I'd be very grateful for any help cause I'm really stuck at this point.
If what you want to do is just to calculate year differences across 2015:2020 and BirthYear, then you don't have to create a separate dataframe. Perhaps just
library(tidyr)
library(dplyr)
df %>%
expand(Year = 2015:2020, nesting(BirthYear, Number, Group)) %>%
group_by(Year, Group) %>%
summarise(
`1` = sum(between(Year - BirthYear, 19, 20) * Number),
`2` = sum((Year - BirthYear < 19) * Number)
) %>%
pivot_wider(names_from = "Group", values_from = c("1", "2"), names_glue = "{Group}{.value}")
Output
`summarise()` regrouping output by 'Year' (override with `.groups` argument)
# A tibble: 6 x 5
# Groups: Year [6]
Year g1 t1 g2 t2
<int> <dbl> <dbl> <dbl> <dbl>
1 2015 2 0 4 5
2 2016 2 0 3 5
3 2017 2 0 2 5
4 2018 2 0 1 5
5 2019 2 0 0 5
6 2020 1 1 0 4
How do I convert the dataframe?
Before:
set.seed(1)
df <- data.frame( n = rpois(16, 2),
year = rep(2011, 16),
month = rep(seq(1,4,1), times = rep(4,4)))
After:
df1 <- data.frame( n = c(8,11,4,9),
year = rep(2011, 4),
month = rep(seq(1,4,1)))
I think that what you want is this, using dplyr:
library(dplyr)
df %>%
group_by(year, month) %>%
summarise(n = sum(n))
# A tibble: 4 x 3
# Groups: year [1]
year month n
<dbl> <dbl> <int>
1 2011 1 8
2 2011 2 11
3 2011 3 4
4 2011 4 9
Using base R with aggregate
aggregate(n ~ ., df, sum)
# year month n
#1 2011 1 8
#2 2011 2 11
#3 2011 3 4
#4 2011 4 9
I am trying to expand a data frame in R with missing observations that are not immediately obvious. Here is what I mean:
data.frame(id = c("a","b"),start = c(2002,2004), end = c(2005,2007))
Which is:
id start end
1 a 2002 2005
2 b 2004 2007
What I would like is a new data frame with 8 total observations, 4 each for "a" and "b", and a year that is one of the values between start and end (inclusive). So:
id year
a 2002
a 2003
a 2004
a 2005
b 2004
b 2005
b 2006
b 2007
As I understand, various versions of expand only work on unique values, but here my data frame doesn't have all the unique values (explicitly).
I was thinking to step through each row and then generate a data frame with sapply(), then join all the new data frames together. But this attempt fails:
sapply(test,function(x) { data.frame( id=rep(id,x[["end"]]-x[["start"]]), year = x[["start"]]:x[["end"]] )})
I know there must be some dplyr or other magic to solve this problem!
you could use tidyr and dplyr
library(tidyr)
library(dplyr)
df %>%
gather(key = key, value = year, -id) %>%
select(-key) %>%
group_by(id) %>%
complete(year = full_seq(year,1))
# A tibble: 8 x 2
# Groups: id [2]
id year
<fct> <dbl>
1 a 2002
2 a 2003
3 a 2004
4 a 2005
5 b 2004
6 b 2005
7 b 2006
8 b 2007
Using dplyr and tidyr, I make a new column which contains the list of years, then unnest the dataframe.
library(tidyr)
library(dplyr)
df <-
data.frame(
id = c("a", "b"),
start = c(2002, 2004),
end = c(2005, 2007)
)
df %>%
rowwise() %>%
mutate(year = list(seq(start, end))) %>%
select(-start, -end) %>%
unnest()
Output
# A tibble: 8 x 2
id year
<fct> <int>
1 a 2002
2 a 2003
3 a 2004
4 a 2005
5 b 2004
6 b 2005
7 b 2006
8 b 2007
An easy solution with data.table:
library(data.table)
# option 1
setDT(df)[, .(year = seq(start, end)), by = id]
# option 2
setDT(df)[, .(year = start:end), by = id]
which gives:
id year
1: a 2002
2: a 2003
3: a 2004
4: a 2005
5: b 2004
6: b 2005
7: b 2006
8: b 2007
An approach with base R:
lst <- Map(seq, df$start, df$end)
data.frame(id = rep(df$id, lengths(lst)), year = unlist(lst))