How to extend calendar periods to complete a dataframe in R? - r

The code posted at the bottom does a nice job of filling in a dataframe, using package tidyr, so that all ID's end up with the same number of periods, in the case of period defined as number of months ("Period_1" in the below code). Base dataframe testDF has ID of 1 with 5 periods, and ID of 50 and 60 with only 3 periods each. The tidyr code creates additional periods ("Period_1") for ID of 50 and 60 so they too have 5 Period_1´s. The code copies down the "Bal" and "State" fields so that all ID end up with the same number of Period_1, which is correct.
However, how would I extend the calendar month expression of "Period_2" in the same manner, as illustrated immediately below?
Code:
library(tidyr)
# Example data: ID 1 has five monthly periods, IDs 50 and 60 have three each.
# Period_1 counts periods per ID; Period_2 is the matching calendar month.
testDF <- data.frame(
  ID       = rep(c(1, 50, 60), times = c(5, 3, 3)),
  Period_1 = as.numeric(sequence(c(5, 3, 3))),
  Period_2 = c(
    "2012-06", "2012-07", "2012-08", "2012-09", "2012-10",
    "2013-06", "2013-07", "2013-08",
    "2012-01", "2012-02", "2012-03"
  ),
  Bal      = c(rep(10, 5), 21, 22, 23, 36, 35, 34),
  State    = c("XX", "AA", "BB", "CC", "XX", "AA", "BB", "CC", "SS", "XX", "AA")
)
# Give every ID a row for every Period_1 value seen in the data, then carry
# Bal and State down into the newly created rows.
testDFextend <- tidyr::fill(
  tidyr::complete(testDF, ID, nesting(Period_1)),
  Bal, State,
  .direction = "down"
)
testDFextend
Edit: rolling from one year to the next
A better OP example would have Period 2 = c("2012-06","2012-07","2012-08","2012-09","2012-10","2013-06","2013-07","2013-08","2012-10","2012-11","2012-12"), providing an example whereby extending Period_2 causes a rollover to the next year. Below I add to the tidyr/dplyr answer below to correctly roll over the year:
library(tidyr)
library(dplyr)
# Same example data as before, except that ID 60 now ends in December 2012,
# so extending its periods has to cross into the next calendar year.
testDF <- data.frame(
  ID       = rep(c(1, 50, 60), times = c(5, 3, 3)),
  Period_1 = as.numeric(sequence(c(5, 3, 3))),
  Period_2 = c(
    "2012-06", "2012-07", "2012-08", "2012-09", "2012-10",
    "2013-06", "2013-07", "2013-08",
    "2012-10", "2012-11", "2012-12"
  ),
  Bal      = c(rep(10, 5), 21, 22, 23, 36, 35, 34),
  State    = c("XX", "AA", "BB", "CC", "XX", "AA", "BB", "CC", "SS", "XX", "AA")
)
# Give every ID the same set of Period_1 values and carry Bal/State down.
testDFextend <-
  testDF %>%
  tidyr::complete(ID, nesting(Period_1)) %>%
  tidyr::fill(Bal, State, .direction = "down")

# Extend Period_2 ("YYYY-MM") into the padded rows, rolling over into the
# following year(s) when the extension runs past December.
testDFextend %>%
  separate(Period_2, into = c("year", "month"), convert = TRUE) %>%
  group_by(ID) %>%
  # the padded rows take the year of the last known period of their own ID
  fill(year) %>%
  mutate(
    # extrapolate the missing month numbers (may exceed 12); round before
    # converting because the spline returns doubles
    month = as.integer(round(zoo::na.spline(month))),
    # Count months on one zero-based axis so that %/% and %% split it back
    # into a year and a month in 1..12. This also handles months that are an
    # exact multiple of 12 (e.g. 24), which a plain `month %% 12` turns into 0.
    month_index = year * 12L + month - 1L,
    Period_2 = sprintf("%d-%02d", month_index %/% 12L, month_index %% 12L + 1L)
  ) %>%
  ungroup() %>%
  select(ID, Period_1, Period_2, Bal, State)

A tidyverse solution based on zoo::na.spline. Note that it does not handle year changes. It's harder than I thought, especially because zoo::na.spline does not seem to work on yearmon format.
library(tidyr)
library(dplyr)
# Fill the missing Period_2 values by extrapolating the month number per ID.
# The year is only carried forward, so this does not handle year changes.
testDFextend %>%
  separate(Period_2, into = c("year", "month"), convert = TRUE) %>%
  fill(year) %>%
  group_by(ID) %>%
  # na.spline() returns doubles; "%02d" rejects a double that is not exactly
  # whole, so round and convert to integer before formatting
  mutate(month = sprintf("%02d", as.integer(round(zoo::na.spline(month))))) %>%
  ungroup() %>%
  unite("Period_2", year, month, sep = "-")
output
ID Period_1 Period_2 Bal State
<dbl> <dbl> <chr> <dbl> <chr>
1 1 1 2012-06 10 XX
2 1 2 2012-07 10 AA
3 1 3 2012-08 10 BB
4 1 4 2012-09 10 CC
5 1 5 2012-10 10 XX
6 50 1 2013-06 21 AA
7 50 2 2013-07 22 BB
8 50 3 2013-08 23 CC
9 50 4 2013-09 23 CC
10 50 5 2013-10 23 CC
11 60 1 2012-01 36 SS
12 60 2 2012-02 35 XX
13 60 3 2012-03 34 AA
14 60 4 2012-04 34 AA
15 60 5 2012-05 34 AA

by ID you can strsplit the date, and take the elements to create a new data.frame to merge with.
ml <- max(with(testDF, tapply(ID, ID, length))) ## get max. period length
# For every ID with fewer than `ml` rows, append the missing periods. The new
# rows continue Period_1 and Period_2 and repeat the ID's last Bal and State.
by(testDF, testDF$ID, \(x) {
  n <- nrow(x)
  if (n == ml) {
    return(x)
  }
  # year and month of the first recorded period, e.g. "2012-10" -> 2012, 10
  first <- as.integer(strsplit(x$Period_2[1], "-", fixed = TRUE)[[1]])
  # Count months on one zero-based axis so that stepping past December rolls
  # over into the next year instead of producing month 13, 14, ...
  idx <- first[1] * 12L + (first[2] - 1L) + n:(ml - 1L)
  rbind(
    x,
    data.frame(
      ID = x$ID[n],
      Period_1 = (n + 1):ml,
      Period_2 = sprintf("%d-%02d", idx %/% 12L, idx %% 12L + 1L),
      Bal = x$Bal[n],
      State = x$State[n]
    )
  )
}) |> c(make.row.names = FALSE) |> do.call(what = rbind)
# ID Period_1 Period_2 Bal State
# 1 1 1 2012-06 10 XX
# 2 1 2 2012-07 10 AA
# 3 1 3 2012-08 10 BB
# 4 1 4 2012-09 10 CC
# 5 1 5 2012-10 10 XX
# 6 50 1 2013-06 21 AA
# 7 50 2 2013-07 22 BB
# 8 50 3 2013-08 23 CC
# 9 50 4 2013-09 23 CC
# 10 50 5 2013-10 23 CC
# 11 60 1 2012-01 36 SS
# 12 60 2 2012-02 35 XX
# 13 60 3 2012-03 34 AA
# 14 60 4 2012-04 34 AA
# 15 60 5 2012-05 34 AA
Edit
For older R versions (although it is recommended to always use up-to-date software), do:
# Same padding as above, written without the native pipe and lambda syntax.
# `ml` is the maximum number of periods per ID computed earlier.
do.call(rbind, c(by(testDF, testDF$ID, function(x) {
  n <- nrow(x)
  if (n == ml) {
    return(x)
  }
  # year and month of the first recorded period, e.g. "2012-10" -> 2012, 10
  first <- as.integer(strsplit(x$Period_2[1], "-", fixed = TRUE)[[1]])
  # Count months on one zero-based axis so that stepping past December rolls
  # over into the next year instead of producing month 13, 14, ...
  idx <- first[1] * 12L + (first[2] - 1L) + n:(ml - 1L)
  rbind(
    x,
    data.frame(
      ID = x$ID[n],
      Period_1 = (n + 1):ml,
      Period_2 = sprintf("%d-%02d", idx %/% 12L, idx %% 12L + 1L),
      Bal = x$Bal[n],
      State = x$State[n]
    )
  )
}), make.row.names = FALSE))

For each ID convert Period_2 to yearmon class. This represents year and month without day. Internally it uses year + fraction where fraction = 0, 1/12, ..., 11/12 for the 12 months. Expand it out using seq. Then convert it back to character or omit the format line to keep the result as a yearmon object.
library(dplyr, exclude = c("filter", "lag"))
library(zoo)
# Rebuild Period_2 per ID as consecutive months starting at the ID's first
# period. yearmon stores a month as year + k/12, so adding multiples of 1/12
# steps month by month and crosses year boundaries naturally.
testDFextend %>%
  group_by(ID) %>%
  # `length.out` is spelled out instead of relying on partial matching
  mutate(Period_2 = as.yearmon(first(Period_2)) + seq(0, by = 1/12, length.out = n())) %>%
  # format line: omit it to keep Period_2 as a yearmon object
  mutate(Period_2 = format(Period_2, "%Y-%m")) %>%
  ungroup()
giving:
# A tibble: 15 × 5
ID Period_1 Period_2 Bal State
<dbl> <dbl> <chr> <dbl> <chr>
1 1 1 2012-06 10 XX
2 1 2 2012-07 10 AA
3 1 3 2012-08 10 BB
4 1 4 2012-09 10 CC
5 1 5 2012-10 10 XX
6 50 1 2013-06 21 AA
7 50 2 2013-07 22 BB
8 50 3 2013-08 23 CC
9 50 4 2013-09 23 CC
10 50 5 2013-10 23 CC
11 60 1 2012-01 36 SS
12 60 2 2012-02 35 XX
13 60 3 2012-03 34 AA
14 60 4 2012-04 34 AA
15 60 5 2012-05 34 AA

I think the nicest way to do this is to make use of the padr package, which is built to pad data.frames where there are missing/incomplete columns.
This uses grouping and cur_data() to make the correct date sequence in Period_2.
library(dplyr)
library(tidyr)
library(padr)
# Number of periods every ID should end up with.
n_periods <- 5
testDF %>%
  # add the missing Period_1 values up to n_periods within each ID
  pad_int(end_val = n_periods, by = "Period_1", group = "ID") %>%
  group_by(ID) %>%
  mutate(
    # first of the month, so the "YYYY-MM" strings can be handled as dates
    Period_2 = as.Date(paste0(Period_2, "-01")),
    # Monthly sequence from the group's first date. `Period_2[1]` replaces the
    # deprecated `cur_data()$Period_2[1]`, and `n()` keeps the sequence the
    # same length as the group.
    Period_2 = format(seq(Period_2[1], by = "month", length.out = n()), "%Y-%m")
  ) %>%
  fill(Bal, State) %>%
  ungroup() %>%
  select(ID, Period_1, Period_2, Bal, State)
ID Period_1 Period_2 Bal State
<dbl> <dbl> <chr> <dbl> <chr>
1 1 1 2012-06 10 XX
2 1 2 2012-07 10 AA
3 1 3 2012-08 10 BB
4 1 4 2012-09 10 CC
5 1 5 2012-10 10 XX
6 50 1 2013-06 21 AA
7 50 2 2013-07 22 BB
8 50 3 2013-08 23 CC
9 50 4 2013-09 23 CC
10 50 5 2013-10 23 CC
11 60 1 2012-01 36 SS
12 60 2 2012-02 35 XX
13 60 3 2012-03 34 AA
14 60 4 2012-04 34 AA
15 60 5 2012-05 34 AA
Note that this will handle cases when the year rolls over to the next year during Period_2.
Finally, you could adjust n_periods if you needed a different number of periods (or use a function to figure it out automatically, like jay.sf's answer).

Related

How to get this outcome in R

I have multiple data frames. Here I have demonstrated 3 data frames with different rows.
# Three example data frames of different lengths. In each one the last two
# rows ("Value<k>", "Score<k>") hold the figures wanted in the result.
dat1 <- read.table(
  header = TRUE,
  text = " D Size1
A1 12
A2 18
A3 16
A4 14
A5 11
A6 0
Value1 25
Score1 30
"
)
dat2 <- read.table(
  header = TRUE,
  text = " D Size2
S12 5
S13 9
S14 11
S15 12
S16 12
Value2 40
Score2 45
"
)
dat3 <- read.table(
  header = TRUE,
  text = " D Size2
S17 0
S19 1
S22 2
S33 1
Value3 22
Score3 60
"
)
I want to get the following outcome:
D Value Score
1 25 30
2 40 45
3 22 60
I need to get a data frame only for value and score
We may have to filter the rows after binding the datasets into a single data and then use pivot_wider to reshape back to wide
library(dplyr)
library(tidyr)
library(stringr)
# Stack the three data frames, keep only the Value*/Score* rows, and reshape
# them into one row per trailing number with a Value and a Score column.
bind_rows(dat1, dat2, dat3) %>%
# keep rows whose D contains "Value" or "Score" followed by digits
filter(str_detect(D, '(Value|Score)\\d+')) %>%
# split D between a lowercase letter and the digit after it,
# e.g. "Value1" -> colnm = "Value", D = "1"
separate(D, into = c("colnm", "D"), sep = "(?<=[a-z](?=\\d))") %>%
group_by(colnm, D) %>%
# take the first non-missing of the two size columns; keeps colnm, D, Score
transmute(Score = coalesce(Size1, Size2)) %>%
ungroup %>%
# one column per colnm value ("Value", "Score")
pivot_wider(names_from = colnm, values_from = Score)
-output
# A tibble: 3 × 3
D Value Score
<chr> <int> <int>
1 1 25 30
2 2 40 45
3 3 22 60
Or an option in base R
# Build one row per input data frame (its index, its Value and its Score),
# then stack the rows.
do.call(
  rbind,
  Map(
    function(idx, dat) {
      sizes <- dat[[2]]
      data.frame(
        D = idx,
        Value = sizes[grepl('Value', dat$D)],
        Score = sizes[grepl('Score', dat$D)]
      )
    },
    1:3,
    list(dat1, dat2, dat3)
  )
)
D Value Score
1 1 25 30
2 2 40 45
3 3 22 60

R output BOTH maximum and minimum value by group in dataframe

Let's say I have a dataframe of Name and value, is there any ways to extract BOTH minimum and maximum values within Name in a single function?
# Reproducible example: three names with three sampled values each.
set.seed(1)
df <- tibble(
  Name  = rep(c("A", "B", "C"), each = 3),
  Value = sample(1:100, 9)
)
# A tibble: 9 x 2
Name Value
<chr> <int>
1 A 27
2 A 37
3 A 57
4 B 89
5 B 20
6 B 86
7 C 97
8 C 62
9 C 58
The output should contain TWO columns only (Name and Value).
Thanks in advance!
You can use range to get max and min value and use it in summarise to get different rows for each Name.
library(dplyr)
# range() returns c(min, max), i.e. two rows per Name. reframe() is the verb
# meant for multi-row results per group (multi-row summarise() is deprecated)
# and always returns an ungrouped data frame.
df %>%
  group_by(Name) %>%
  reframe(Value = range(Value))
# Name Value
# <chr> <int>
#1 A 27
#2 A 57
#3 B 20
#4 B 89
#5 C 58
#6 C 97
If you have large dataset using data.table might be faster.
library(data.table)
# Convert df to a data.table by reference, then take range() within each Name.
setDT(df)[, .(Value = range(Value)), by = Name]
You can use dplyr::group_by() and dplyr::summarise() like this:
library(dplyr)
set.seed(1)
df <- tibble(
  Name  = rep(c("A", "B", "C"), each = 3),
  Value = sample(1:100, 9)
)
# One row per Name, with the largest and smallest Value in separate columns.
summarise(
  group_by(df, Name),
  maximum = max(Value),
  minimum = min(Value)
)
This outputs:
# A tibble: 3 × 3
Name maximum minimum
<chr> <int> <int>
1 A 68 1
2 B 87 34
3 C 82 14
What's a little odd is that my original df object looks a little different than yours, in spite of the seed:
# A tibble: 9 × 2
Name Value
<chr> <int>
1 A 68
2 A 39
3 A 1
4 B 34
5 B 87
6 B 43
7 C 14
8 C 82
9 C 59
I'm currently using rbind() together with slice_min() and slice_max(), but I think it may not be the best way or the most efficient way when the dataframe contains millions of rows.
library(tidyverse)
# Group once, then stack the per-group maximum rows on top of the per-group
# minimum rows and sort by Name.
df %>%
  group_by(Name) %>%
  {
    rbind(slice_max(., Value), slice_min(., Value))
  } %>%
  arrange(Name)
# A tibble: 6 x 2
# Groups: Name [3]
Name Value
<chr> <int>
1 A 57
2 A 27
3 B 89
4 B 20
5 C 97
6 C 58
In base R, the output format can be created with tapply/stack - do a group by tapply to get the output as a named list or range, stack it to two column data.frame and change the column names if needed
# tapply() gives range(Value) per Name; stack() turns that into the columns
# `values` and `ind`, which are reordered and renamed to match df.
setNames(
  stack(tapply(df$Value, df$Name, FUN = range))[c("ind", "values")],
  names(df)
)
Name Value
1 A 27
2 A 57
3 B 20
4 B 89
5 C 58
6 C 97
Using aggregate.
# range() per Name; the result holds min and max in a two-column matrix.
aggregate(Value ~ Name, data = df, FUN = range)
# Name Value.1 Value.2
# 1 A 1 68
# 2 B 34 87
# 3 C 14 82

Summarize values by group, but keep original data

I am trying to figure out how to sum values belonging to category a and b by factor file, but also keep the original data.
library(dplyr)
# 20 rows: categories a-e repeated four times, files 1-5 in blocks of four.
df <- data.frame(
  ID       = 1:20,
  values   = runif(20),
  category = rep(letters[1:5], times = 4),
  file     = factor(rep(1:5, each = 4))
)
ID values category file
1 1 0.65699229 a 1
2 2 0.70506478 b 1
3 3 0.45774178 c 1
4 4 0.71911225 d 1
5 5 0.93467225 e 1
6 6 0.25542882 a 2
7 7 0.46229282 b 2
8 8 0.94001452 c 2
9 9 0.97822643 d 2
10 10 0.11748736 e 2
11 11 0.47499708 a 3
12 12 0.56033275 b 3
13 13 0.90403139 c 3
14 14 0.13871017 d 3
15 15 0.98889173 e 3
16 16 0.94666823 a 4
17 17 0.08243756 b 4
18 18 0.51421178 c 4
19 19 0.39020347 d 4
20 20 0.90573813 e 4
so that
df[1,2] will be added to df[2,2] to category 'ab' for file 1
df[6,2] will be added to df[7,2] to category 'ab' for file 2
etc.
So far I have this:
# Total of the category a and b values within each file.
ab_rows <- c("a", "b")
summarise(
  group_by(filter(df, category %in% ab_rows), file),
  values = sum(values)
)
Problem
I would like to change the category of the summed values to "ab" and append it to the original data frame in the same pipeline.
Desired output:
ID values category file
1 1 0.65699229 a 1
2 2 0.70506478 b 1
3 3 0.45774178 c 1
4 4 0.71911225 d 1
5 5 0.93467225 e 1
6 6 0.25542882 a 2
7 7 0.46229282 b 2
8 8 0.94001452 c 2
9 9 0.97822643 d 2
10 10 0.11748736 e 2
11 11 0.47499708 a 3
12 12 0.56033275 b 3
13 13 0.90403139 c 3
14 14 0.13871017 d 3
15 15 0.98889173 e 3
16 16 0.94666823 a 4
17 17 0.08243756 b 4
18 18 0.51421178 c 4
19 19 0.39020347 d 4
20 20 0.90573813 e 4
21 21 1.25486225 ab 1
22 22 1.87216325 ab 2
23 23 1.36548126 ab 3
This will get you the result
# Append one combined a/b row per file to the original data, then renumber ID.
df %>% bind_rows(
df %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
# every remaining row of a file gets the file's total and the file's
# categories pasted together
mutate(values = sum(values), category = paste0(category,collapse='')) %>%
# keep the first row of each file, and only for files with more than one row left
filter(row_number() == 1 & n() > 1)
) %>% mutate(ID = row_number())
BTW, the code to produce the data frame shown in the example is this one:
# 20 rows: categories a-e repeated four times, files 1-4 in blocks of five.
df <- data.frame(
  ID       = 1:20,
  values   = runif(20),
  category = rep(letters[1:5], times = 4),
  file     = factor(rep(1:4, each = 5))
)
Now, let's say you want to sum multiple columns; you need to provide their names in a vector:
cols <- c("values") # columns to be summed
# Append one combined a/b row per file to the original data, then renumber ID.
df %>% bind_rows(
  df %>%
    filter(category %in% c('a', 'b')) %>%
    group_by(file) %>%
    # across(all_of(cols)) replaces the superseded mutate_at(vars(cols)) and
    # states explicitly that `cols` is an external character vector
    mutate(across(all_of(cols), sum)) %>%
    mutate(category = paste0(category, collapse = '')) %>%
    # keep the first row of each file, and only for files with more than one row left
    filter(row_number() == 1 & n() > 1)
) %>% mutate(ID = row_number())
library(dplyr)
# Sum categories a and b per file, label the totals "ab", give them new IDs,
# and append them below the original rows of df1.
df1 %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
# keep only files in which more than one of the two categories is present
filter(n_distinct(category) > 1) %>%
summarise(values = sum(values)) %>%
# new IDs continue after the largest existing ID (`:` binds tighter than `+`)
mutate(category="ab",
ID=max(df1$ID)+1:n()) %>%
# original data first, the new "ab" rows after it
bind_rows(df1, .)
#> Warning in bind_rows_(x, .id): binding factor and character vector,
#> coercing into character vector
#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector
#> ID values category file
#> 1 1 0.62585921 a 1
#> 2 2 0.61865851 b 1
#> 3 3 0.05274456 c 1
#> 4 4 0.68156961 d 1
.
.
.
#> 19 19 0.43239411 d 5
#> 20 20 0.85886314 e 5
#> 21 21 1.24451773 ab 1
#> 22 22 0.99001810 ab 2
#> 23 23 1.25331943 ab 3
This data.table approach uses a self-join to get all of the possible two-character combinations.
library(data.table)
setDT(df)
# Join df to itself on file so that every row meets every other row of the
# same file, drop same-category pairs, and build the combined category label
# and summed value. IDs continue after the last row of df.
df_self_join <- df[df, on = .(file), allow.cartesian = TRUE # TRUE, not the reassignable `T`
                   ][category != i.category,
                     .(category = paste0(i.category, category), values = values + i.values, file)
                     ][order(category), .(ID = .I + nrow(df), values, category, file)]
rbindlist(list(df, df_self_join))
ID values category file
1: 1 0.76984382 a 1
2: 2 0.54311583 b 1
3: 3 0.23462016 c 1
4: 4 0.60179043 d 1
...
20: 20 0.03534223 e 5
21: 21 1.31295965 ab 1
22: 22 0.51666175 ab 2
23: 23 1.02305754 ab 3
24: 24 1.00446399 ac 1
25: 25 0.96910373 ac 2
26: 26 0.87795389 ac 4
#total of 80 rows
Here is pretty close dplyr translation:
library(dplyr)
# dplyr version of the self-join: pair rows within the same file, combine
# their categories and values, and append the pairs to the original rows.
tib <- as_tibble(df)
inner_join(tib, tib, by = 'file')%>%
# drop the pairs in which a row is matched with itself
filter(ID.x != ID.y)%>%
# keep only the combined label, the summed value and the file
transmute(category = paste0(category.x, category.y)
, values = values.x + values.y
, file)%>%
arrange(category)%>%
# original rows first, the combined rows after them
bind_rows(tib, .)%>%
mutate(ID = row_number())%>%
filter(category == 'ab') #filter added to show the "ab" files
# A tibble: 3 x 4
ID values category file
<int> <dbl> <chr> <fct>
1 21 1.31 ab 1
2 22 0.517 ab 2
3 23 1.02 ab 3

Referring to numbers in data table columns' names in a loop

I have a dataset like here:
# Three customer/account pairs with 3, 4 and 3 monthly observations.
rows_per_account <- c(3, 4, 3)
customer_id <- rep(c("1", "2", "3"), times = rows_per_account)
account_id <- rep(c("11", "55", "38"), times = rows_per_account)
time <- as.Date(
  c(
    "2017-01-01", "2017-02-01", "2017-03-01",
    "2017-12-01", "2018-01-01", "2018-02-01", "2018-03-01",
    "2018-04-01", "2018-05-01", "2018-06-01"
  ),
  format = "%Y-%m-%d"
)
tenor <- c(1, 2, 3, 1, 2, 3, 4, 1, 2, 3)
variable_x <- c(87, 90, 100, 120, 130, 150, 12, 13, 15, 14)
# Combine the vectors into one data.table; columns are named after the vectors.
my_data <- data.table(customer_id,account_id,time,tenor,variable_x)
Now, I would like to create new variables "PD_Q1" up to "PD_Q20" that would equal to the value of "variable_x" when "tenor" is equal to 1 up to 20, i.e., PD_Q1 equal to variable_x's value if tenor = 1, PD_Q2 equal to variable_x's value if tenor = 2, etc. and I would like to do that by customer_id, account_id. I have the code for that, however only for PD_Q1 and I would like to make a loop that loops over i = 1:20 in which I change just tenor == i (this one is easy) and refer to columns PD_Qi in this loop, which is a problem for me. The code for one value of i is here:
# Build PD_Q1: the variable_x value from the tenor == 1 row of each
# customer/account pair, repeated on every row of that pair.
# Step 1: on rows with tenor == 1, copy variable_x into a temporary column
# (added to my_data by reference).
my_data[tenor == 1, PD_Q1_temp := variable_x, by = c("customer_id", "account_id")]
# Step 2: take the key columns plus the temporary value from those rows ...
list_accs <- my_data[tenor == 1, c("customer_id", "account_id", "PD_Q1_temp")]
# ... keep one row per customer/account pair ...
list_accs <- unique(list_accs, by = c("customer_id", "account_id"))
# ... and give the value column its final name.
names(list_accs) = c("customer_id", "account_id", "PD_Q1")
# Step 3: left-join the per-account value back onto every row of my_data.
my_data = merge(x = my_data, y = list_accs, by = c("customer_id", "account_id"), all.x = TRUE)
# Step 4: drop the temporary helper column.
my_data$PD_Q1_temp <- NULL
Now, can you please advise how to make a loop from 1 to 20, in which tenor, PD_Q1_temp and PD_Q1 would change? Specifically, I don't know how to refer to column names or variables using this i index within a loop.
The expected output for i = 1 and i = 2 (creating variables PD_Q1 and PD_Q2) is here:
> my_data
customer_id account_id time tenor variable_x PD_Q1 PD_Q2
1: 1 11 2017-01-01 1 87 87 90
2: 1 11 2017-02-01 2 90 87 90
3: 1 11 2017-03-01 3 100 87 90
4: 2 55 2017-12-01 1 120 120 130
5: 2 55 2018-01-01 2 130 120 130
6: 2 55 2018-02-01 3 150 120 130
7: 2 55 2018-03-01 4 12 120 130
8: 3 38 2018-04-01 1 13 13 15
9: 3 38 2018-05-01 2 15 13 15
10: 3 38 2018-06-01 3 14 13 15
now I want to create PD_Q3, PD_Q4 etc. in a loop using my code above that creates one such variable.
Can you show your expected output?
I think you can do what you want by reshaping the data from long to wide with tidyr (`spread()`, or its successor `pivot_wider()`):
library(dplyr)
library(tidyr)
# One row per customer/account, with one PD_Q<tenor> column per tenor value.
my_data %>%
  as_tibble() %>% # replaces the deprecated tbl_df()
  select(-time) %>% # time differs on every row and would block the reshape
  mutate(tenor = paste0("PD_Q", tenor)) %>%
  # pivot_wider() replaces the superseded spread(); accounts without a given
  # tenor get NA in that column
  pivot_wider(names_from = tenor, values_from = variable_x)
# # A tibble: 3 x 6
# customer_id account_id PD_Q1 PD_Q2 PD_Q3 PD_Q4
# <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 11 87 90 100 NA
# 2 2 55 120 130 150 12
# 3 3 38 13 15 14 NA

pivot table in R

I have a data frame with dates that looks like this:
id weight beginning_date end_date age categ_car
22 2 1960-06-02 1960-06-02 17 A
17 4 2001-07-02 19 B
I want the following dataframe
id weight beginning_date end_date age categ_car
22 2 1960-06-02 1960-06-02 17 A
22 2 1961-06-02 1961-06-02 18 A
17 4 2001-07-02 19 B
17 4 2002-07-02 20 B
17 4 2003-07-02 21 B
17 4 2004-07-02 22 B
I know that I can use the melt function from the reshape2 package to create the pivot, but I don't know how I can increment the date and age.
thank you,
N
Here is some help to get you going. You need to get the year from date columns, apply the same function for date columns, and bind them all after:
library(data.table)
setDT(df)
# Return the values that follow `a` in steps of one: a + 1, ..., a + (x - 1).
#
# a: numeric starting value (e.g. the age on the existing row).
# x: single number >= 1, the total number of rows wanted including the
#    existing one; x - 1 new values are produced.
# Returns a numeric vector of length x - 1 (empty when x is 1).
AddWeightage <- function(a, x) {
  stopifnot(length(x) == 1L, !is.na(x), x >= 1)
  # as.numeric() keeps the result a double, as cumsum(rep(1, n)) did
  a + as.numeric(seq_len(x - 1))
}
# Apply AddWeightage to the age column once per categ_car group, passing the
# group's weight as `x`.
cols<-c("age")
df[,lapply(.SD,AddWeightage,x=weight), by=.(categ_car),.SDcols=cols]
Here is the function to generate date columns:
# Return the dates that follow `a` in steps of one year, as "YYYY-MM-DD"
# strings: the month and day of `a` are kept and the year is increased by
# 1, ..., x - 1.
#
# a: a single date (Date, or a string as.Date() can parse), or NA.
# x: single number >= 1, the total number of rows wanted including the
#    existing one; x - 1 new values are produced.
# Returns a character vector of length x - 1. A missing `a` gives NA values
# rather than the literal string "NANA".
AddWeightDate <- function(a, x) {
  stopifnot(length(a) == 1L, length(x) == 1L, !is.na(x), x >= 1)
  offsets <- seq_len(x - 1)
  if (is.na(a)) {
    return(rep(NA_character_, length(offsets)))
  }
  start <- as.Date(a)
  # base R only, so no attached package has to provide year()
  start_year <- as.integer(format(start, "%Y"))
  month_day <- format(start, "-%m-%d")
  # sprintf() returns character(0) when there are no offsets
  sprintf("%d%s", start_year + offsets, month_day)
}
# Apply AddWeightDate to both date columns once per categ_car group, passing
# the group's weight as `x`; the result is stored in df3.
cols<-c('beginning_date',"end_date")
df3<-df[,lapply(.SD,AddWeightDate,x=weight), by=.(categ_car),.SDcols=cols]
We can use complete and fill from tidyr package to find a solution. Important point is to generate a sequence of dates (increment by 1 year) using %m+% operator from lubridate package.
library(dplyr)
library(tidyr)
library(lubridate)
# Expand each id into yearly rows starting at its beginning_date.
df %>%
# parse both date columns with ymd()
mutate(beginning_date = ymd(beginning_date), end_date = ymd(end_date)) %>%
group_by(id) %>%
# add a row for every year from beginning_date to beginning_date + (weight - 1) years
complete(beginning_date = seq(beginning_date, beginning_date %m+% years(weight-1),
by="1 year")) %>%
# copy the remaining columns down into the added rows (values are repeated,
# not incremented)
fill(weight, end_date, age, categ_car) %>%
arrange(desc(id)) %>%
select(id, weight, beginning_date, end_date, age, categ_car)
# # A tibble: 6 x 6
# # Groups: id [2]
# id weight beginning_date end_date age categ_car
# <int> <int> <date> <date> <int> <chr>
# 1 22 2 1960-06-02 1960-06-02 17 A
# 2 22 2 1961-06-02 1960-06-02 17 A
# 3 17 4 2001-07-02 NA 19 B
# 4 17 4 2002-07-02 NA 19 B
# 5 17 4 2003-07-02 NA 19 B
# 6 17 4 2004-07-02 NA 19 B
Update: Based on feedback from the OP, to handle multiple `beginning_date` values for the same `id`:
# Same expansion as above, but the yearly sequence starts at the earliest
# beginning_date of each id, so an id may have several beginning_date values.
df %>%
mutate(beginning_date = ymd(beginning_date), end_date = ymd(end_date)) %>%
group_by(id) %>%
# yearly sequence from min(beginning_date) to min(beginning_date) + (weight - 1) years
complete(beginning_date = seq(as.Date(min(beginning_date), origin="1970-01-01"),
as.Date(min(beginning_date), origin="1970-01-01") %m+% years(weight-1),
by="1 year")) %>%
# copy the remaining columns down into the added rows
fill(weight, end_date, age, categ_car) %>%
arrange(desc(id)) %>%
select(id, weight, beginning_date, end_date, age, categ_car)
Data
# Example input: one row per id; the missing end_date is written as NA.
df <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text =
"id weight beginning_date end_date age categ_car
22 2 1960-06-02 1960-06-02 17 A
17 4 2001-07-02 NA 19 B"
)
Note: NA has been used instead of blank value for end_date.

Resources