Substitute values across different data frames in R

I have the following code for 4 data frames. The last column of each has only two kinds of values: zero ("0") or an id, which is the same within a given data frame but differs between data frames.
How can I substitute all the zeros in the id column with that data frame's id?
For example, change df1 from:
year counts id
1 2015 0 0
2 2016 0 0
3 2017 7 Fg4s5
4 2018 8 Fg4s5
5 2019 5 0
6 2020 12 Fg4s5
to:
year counts id
1 2015 0 Fg4s5
2 2016 0 Fg4s5
3 2017 7 Fg4s5
4 2018 8 Fg4s5
5 2019 5 Fg4s5
6 2020 12 Fg4s5
Same for other dfs with their ids.
Code for dataframes:
df1 <- data.frame(
  year = 2015:2020,
  counts = c(0, 0, 7, 8, 5, 12),
  id = c(0, 0, "Fg4s5", "Fg4s5", 0, "Fg4s5")
)
df2 <- data.frame(
  year = 2014:2020,
  counts = c(1, 5, 9, 2, 2, 19, 3),
  id = c(0, 0, 0, 0, 0, "Qd8a2", "Qd8a2")
)
df3 <- data.frame(
  year = 2016:2020,
  counts = c(0, 0, 0, 0, 6),
  id = c(0, 0, "Wk9l4", "Wk9l4", "Wk9l4")
)
df4 <- data.frame(
  year = 2014:2020,
  counts = c(0, 0, 8, 1, 9, 12, 23),
  id = c(0, "Rd7q0", 0, 0, "Rd7q0", "Rd7q0", "Rd7q0")
)

Put the data frames in a list and change the values in the id columns using lapply:
list_df <- list(df1, df2, df3, df4)
list_df <- lapply(list_df, function(x) {
  transform(x, id = replace(id, id == "0", id[id != "0"][1]))
})
list_df
#[[1]]
# year counts id
#1 2015 0 Fg4s5
#2 2016 0 Fg4s5
#3 2017 7 Fg4s5
#4 2018 8 Fg4s5
#5 2019 5 Fg4s5
#6 2020 12 Fg4s5
#[[2]]
# year counts id
#1 2014 1 Qd8a2
#2 2015 5 Qd8a2
#3 2016 9 Qd8a2
#4 2017 2 Qd8a2
#5 2018 2 Qd8a2
#6 2019 19 Qd8a2
#7 2020 3 Qd8a2
#[[3]]
# year counts id
#1 2016 0 Wk9l4
#2 2017 0 Wk9l4
#3 2018 0 Wk9l4
#4 2019 0 Wk9l4
#5 2020 6 Wk9l4
#[[4]]
# year counts id
#1 2014 0 Rd7q0
#2 2015 0 Rd7q0
#3 2016 8 Rd7q0
#4 2017 1 Rd7q0
#5 2018 9 Rd7q0
#6 2019 12 Rd7q0
#7 2020 23 Rd7q0
To put them back into separate data frames (overwriting the originals in the global environment):
names(list_df) <- paste0('df', 1:4)
list2env(list_df, .GlobalEnv)

Using purrr::map (dplyr supplies mutate() and first()):
library(dplyr)
library(purrr)
map(list(df1, df2, df3, df4), ~ .x %>% mutate(id = first(id[id != "0"])))
[[1]]
year counts id
1 2015 0 Fg4s5
2 2016 0 Fg4s5
3 2017 7 Fg4s5
4 2018 8 Fg4s5
5 2019 5 Fg4s5
6 2020 12 Fg4s5
[[2]]
year counts id
1 2014 1 Qd8a2
2 2015 5 Qd8a2
3 2016 9 Qd8a2
4 2017 2 Qd8a2
5 2018 2 Qd8a2
6 2019 19 Qd8a2
7 2020 3 Qd8a2
[[3]]
year counts id
1 2016 0 Wk9l4
2 2017 0 Wk9l4
3 2018 0 Wk9l4
4 2019 0 Wk9l4
5 2020 6 Wk9l4
[[4]]
year counts id
1 2014 0 Rd7q0
2 2015 0 Rd7q0
3 2016 8 Rd7q0
4 2017 1 Rd7q0
5 2018 9 Rd7q0
6 2019 12 Rd7q0
7 2020 23 Rd7q0
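One caveat for both approaches: if a data frame contained only zeros in its id column, id[id != "0"][1] (and likewise first(id[id != "0"])) would return NA and overwrite the whole column with NA. A guarded variant of the lapply() call above, assuming such a frame should be left untouched:
lapply(list(df1, df2, df3, df4), function(x) {
  nonzero <- x$id[x$id != "0"]
  if (length(nonzero) > 0) x$id <- nonzero[1]  # fill only when an id actually exists
  x
})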

Related

Count the occurrences of accidents until the next accident

I have the following data frame and I would like to create the "OUTPUT_COLUMN".
Explanation of columns:
ID is the identification number of the policy
ID_REG_YEAR is the identification number per registration year
CALENDAR_YEAR is the year in which the policy has exposure
NUMBER_OF_RENEWALS is the number of times the policy has been renewed
ACCIDENT indicates whether an accident occurred
KEY TO THE DATASET: ID_REG_YEAR and CALENDAR_YEAR
Basically, if NUMBER_OF_RENEWALS = 0 then OUTPUT_COLUMN = 100. Any row before which no accident has occurred should also contain 100 (e.g. rows 13, 16, 17). Once an accident has occurred, I would like to count the number of renewals until the next accident.
ID ID_REG_YEAR CALENDAR_YEAR NUMBER_OF_RENEWALS ACCIDENT OUTPUT_COLUMN
1 A A_2015 2015 0 YES 100
2 A A_2015 2016 0 YES 100
3 A A_2016 2016 1 YES 0
4 A A_2016 2017 1 YES 0
5 A A_2017 2017 2 NO 1
6 A A_2017 2018 2 NO 1
7 A A_2018 2018 3 NO 2
8 A A_2018 2019 3 NO 2
9 A A_2019 2019 4 YES 0
10 A A_2019 2020 4 YES 0
11 B B_2015 2015 0 NO 100
12 B B_2015 2016 0 NO 100
13 B B_2016 2016 1 NO 100
14 C C_2013 2013 0 NO 100
15 C C_2013 2014 0 NO 100
16 C C_2014 2014 1 NO 100
17 C C_2014 2015 1 NO 100
18 C C_2015 2015 2 YES 0
19 C C_2015 2016 2 YES 0
20 C C_2016 2016 3 NO 1
21 C C_2016 2017 3 NO 1
22 C C_2017 2017 4 NO 2
23 C C_2017 2018 4 NO 2
24 C C_2018 2018 5 YES 0
25 C C_2018 2019 5 YES 0
26 C C_2019 2019 6 NO 1
27 C C_2019 2020 6 NO 1
28 C C_2020 2020 7 NO 2
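The answer below operates on a df that the question never constructs; a minimal reconstruction from the table above (column types assumed, OUTPUT_COLUMN included as shown):
df <- data.frame(
  ID = rep(c("A", "B", "C"), times = c(10, 3, 15)),
  ID_REG_YEAR = rep(c("A_2015", "A_2016", "A_2017", "A_2018", "A_2019",
                      "B_2015", "B_2016",
                      "C_2013", "C_2014", "C_2015", "C_2016", "C_2017",
                      "C_2018", "C_2019", "C_2020"),
                    times = c(2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1)),
  CALENDAR_YEAR = c(2015, 2016, 2016, 2017, 2017, 2018, 2018, 2019, 2019, 2020,
                    2015, 2016, 2016,
                    2013, 2014, 2014, 2015, 2015, 2016, 2016, 2017, 2017, 2018,
                    2018, 2019, 2019, 2020, 2020),
  NUMBER_OF_RENEWALS = rep(c(0:4, 0:1, 0:7),
                           times = c(2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1)),
  ACCIDENT = rep(c("YES", "NO", "YES", "NO", "NO", "YES", "NO", "YES", "NO"),
                 times = c(4, 4, 2, 3, 4, 2, 4, 2, 3)),
  OUTPUT_COLUMN = rep(c(100, 0, 1, 2, 0, 100, 100, 0, 1, 2, 0, 1, 2),
                      times = c(2, 2, 2, 2, 2, 3, 4, 2, 2, 2, 2, 2, 1))
)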
Here is a dplyr solution. First, split ID_REG_YEAR into a separate registration-year column, which will be used to calculate renewals since the prior accident (this assumes "renewals" can be counted as years since the last accident). Then, after grouping by ID, create a column holding the year of the last accident; fill() propagates that value downward. The final output column is set to 100 if there is no prior accident or NUMBER_OF_RENEWALS is zero, and otherwise to the registration year minus the last accident year.
library(dplyr)
library(tidyr)  # separate() and fill()
df %>%
  separate(ID_REG_YEAR, into = c("ID_REG", "REG_YEAR"), convert = TRUE) %>%
  group_by(ID) %>%
  mutate(LAST_ACCIDENT = ifelse(ACCIDENT == "YES", REG_YEAR, NA_integer_)) %>%
  fill(LAST_ACCIDENT, .direction = "down") %>%
  mutate(OUTPUT_COLUMN_2 = ifelse(
    is.na(LAST_ACCIDENT) | NUMBER_OF_RENEWALS == 0, 100, REG_YEAR - LAST_ACCIDENT
  ))
Output
ID ID_REG REG_YEAR CALENDAR_YEAR NUMBER_OF_RENEWALS ACCIDENT OUTPUT_COLUMN LAST_ACCIDENT OUTPUT_COLUMN_2
<chr> <chr> <int> <int> <int> <chr> <int> <int> <dbl>
1 A A 2015 2015 0 YES 100 2015 100
2 A A 2015 2016 0 YES 100 2015 100
3 A A 2016 2016 1 YES 0 2016 0
4 A A 2016 2017 1 YES 0 2016 0
5 A A 2017 2017 2 NO 1 2016 1
6 A A 2017 2018 2 NO 1 2016 1
7 A A 2018 2018 3 NO 2 2016 2
8 A A 2018 2019 3 NO 2 2016 2
9 A A 2019 2019 4 YES 0 2019 0
10 A A 2019 2020 4 YES 0 2019 0
# … with 18 more rows
Note: if you want to go by the renewal counter (NUMBER_OF_RENEWALS) rather than by year, you can do something similar: instead of recording the last accident's year, record the renewal count at the last accident. The output column then reflects the difference in renewal counts instead of years (covering the possibility that one or more years were skipped).
df %>%
  separate(ID_REG_YEAR, into = c("ID_REG", "REG_YEAR"), convert = TRUE) %>%
  group_by(ID) %>%
  mutate(LAST_ACCIDENT_POLICY = ifelse(ACCIDENT == "YES", NUMBER_OF_RENEWALS, NA_integer_)) %>%
  fill(LAST_ACCIDENT_POLICY, .direction = "down") %>%
  mutate(OUTPUT_COLUMN_2 = ifelse(
    is.na(LAST_ACCIDENT_POLICY) | NUMBER_OF_RENEWALS == 0, 100,
    NUMBER_OF_RENEWALS - LAST_ACCIDENT_POLICY
  ))

How to find first non-NA leading or lagging value?

I have rows grouped by ID and I want to calculate how much time passes until the next event occurs (if it does occur for that ID).
Here is example code:
year <- c(2015, 2016, 2017, 2018, 2015, 2016, 2017, 2018, 2015, 2016, 2017, 2018)
id <- c(rep("A", times = 4), rep("B", times = 4), rep("C", times = 4))
event_date <- c(NA, 2016, NA, 2018, NA, NA, NA, NA, 2015, NA, NA, 2018)
df <- as.data.frame(cbind(id, year, event_date))
df
id year event_date
1 A 2015 <NA>
2 A 2016 2016
3 A 2017 <NA>
4 A 2018 2018
5 B 2015 <NA>
6 B 2016 <NA>
7 B 2017 <NA>
8 B 2018 <NA>
9 C 2015 2015
10 C 2016 <NA>
11 C 2017 <NA>
12 C 2018 2018
Here is what I want the output to look like:
id year event_date years_till_next_event
1 A 2015 <NA> 1
2 A 2016 2016 0
3 A 2017 <NA> 1
4 A 2018 2018 0
5 B 2015 <NA> <NA>
6 B 2016 <NA> <NA>
7 B 2017 <NA> <NA>
8 B 2018 <NA> <NA>
9 C 2015 2015 0
10 C 2016 <NA> 2
11 C 2017 <NA> 1
12 C 2018 2018 0
Person B never has the event, so nothing is calculated for B. For the others, I want the difference between the next non-NA event_date (if one exists) and the year.
I want to calculate years_till_next_event such that: 1) if a row has an event_date, use event_date - year; 2) if not, use the first non-NA leading value minus year. I'm having difficulty with the second part of the logic, keeping in mind that, per ID, the event can occur anywhere from never to every year.
Using zoo with dplyr:
library(dplyr)
library(zoo)
df %>%
  group_by(id) %>%
  mutate(years_till_next_event = na.locf0(event_date, fromLast = TRUE) - year)
Here is a data.table option
setDT(df)[, years_till_next_event := nafill(event_date, type = "nocb") - year, id]
which gives
id year event_date years_till_next_event
1: A 2015 NA 1
2: A 2016 2016 0
3: A 2017 NA 1
4: A 2018 2018 0
5: B 2015 NA NA
6: B 2016 NA NA
7: B 2017 NA NA
8: B 2018 NA NA
9: C 2015 2015 0
10: C 2016 NA 2
11: C 2017 NA 1
12: C 2018 2018 0
You can create a new column that holds the row number within each id wherever event_date is not NA, fill the NA values upward from the next non-NA value, and then subtract the current row number.
library(dplyr)
df %>%
  group_by(id) %>%
  mutate(years_till_next_event = replace(row_number(), is.na(event_date), NA)) %>%
  tidyr::fill(years_till_next_event, .direction = 'up') %>%
  mutate(years_till_next_event = years_till_next_event - row_number()) %>%
  ungroup()
# id year event_date years_till_next_event
# <chr> <dbl> <dbl> <int>
# 1 A 2015 NA 1
# 2 A 2016 2016 0
# 3 A 2017 NA 1
# 4 A 2018 2018 0
# 5 B 2015 NA NA
# 6 B 2016 NA NA
# 7 B 2017 NA NA
# 8 B 2018 NA NA
# 9 C 2015 2015 0
#10 C 2016 NA 2
#11 C 2017 NA 1
#12 C 2018 2018 0
data
df <- data.frame(id, year, event_date)
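This reconstruction matters: the question built df with as.data.frame(cbind(...)), and cbind() produces a character matrix, so year and event_date come out as character and the subtractions above would fail. A quick check with the question's vectors:
# every column ends up character, because cbind() builds a character matrix first
str(as.data.frame(cbind(id, year, event_date)))
# year and event_date stay numeric, which the arithmetic above requires
str(data.frame(id, year, event_date))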

R: Use data frame names for columns after/before applying purrr reduce

I already checked this solution, but unfortunately it does not fit my more complex data.
Raw Data:
I have a list named Total.Scores with data frames named
2000-2020, each containing annual data for one year from 2000 till 2020. Each data frame has a different number of rows but always 12 columns: ID, Category, Score.1-9, and Year.
Sample Data:
library(purrr)
Total.Scores <- list(
  "2020" = data.frame(ID = c("A2_101", "B3_102", "LO_103", "TT_101"),
                      Category = c("blue", "red", "green", "red"),
                      Score.1 = c(1, 2, 3, 0),
                      Score.2 = c(3, 4, 5, 2),
                      Score.3 = c(0, 0, 1, 1),
                      Year = c(2020, 2020, 2020, 2020)),
  "2019" = data.frame(ID = c("A2_101", "B3_102", "LO_103"),
                      Category = c("blue", "red", "green"),
                      Score.1 = c(1, 2, 3),
                      Score.2 = c(3, 4, 5),
                      Score.3 = c(0, 0, 1),
                      Year = c(2019, 2019, 2019)),
  "2018" = data.frame(ID = c("A2_101", "B3_102", "LO_103", "TT_201", "AA_345"),
                      Category = c("blue", "red", "green", "yellow", "purple"),
                      Score.1 = c(1, 2, 3, 3, 5),
                      Score.2 = c(3, 4, 5, 5, 3),
                      Score.3 = c(0, 0, 1, 3, 0),
                      Year = c(2018, 2018, 2018, 2018, 2018)),
  "2017" = data.frame(ID = c("A2_101", "B3_102", "LO_103", "TT_101"),
                      Category = c("blue", "red", "green", "red"),
                      Score.1 = c(1, 2, 3, 0),
                      Score.2 = c(3, 4, 5, 2),
                      Score.3 = c(0, 0, 1, 1),
                      Year = c(2017, 2017, 2017, 2017))
)
Joined Data:
I combine the data frames from the Total.Scores list into the new large data frame Total.Yearly.Scores via a full_join by ID and Category:
Total.Yearly.Scores <- Total.Scores %>% reduce(full_join, by = c("ID", "Category"))
Result:
# Total.Yearly.Scores
ID Category Score.1.x Score.2.x Score.3.x Year.x Score.1.y Score.2.y Score.3.y Year.y Score.1.x.x Score.2.x.x Score.3.x.x Year.x.x
1 A2_101 blue 1 3 0 2020 1 3 0 2019 1 3 0 2018
2 B3_102 red 2 4 0 2020 2 4 0 2019 2 4 0 2018
3 LO_103 green 3 5 1 2020 3 5 1 2019 3 5 1 2018
4 TT_101 red 0 2 1 2020 NA NA NA NA NA NA NA NA
5 TT_201 yellow NA NA NA NA NA NA NA NA 3 5 3 2018
6 AA_345 purple NA NA NA NA NA NA NA NA 5 3 0 2018
Score.1.y.y Score.2.y.y Score.3.y.y Year.y.y
1 1 3 0 2017
2 2 4 0 2017
3 3 5 1 2017
4 0 2 1 2017
5 NA NA NA NA
6 NA NA NA NA
Question:
How can I adjust my code so that the column headers for the Score.1-9 and Year columns incorporate the data frame names of 2000-2020?
For example, changing them from Score.1.x to Score.1 2020:
# Total.Yearly.Scores
ID Category Score.1 2020 Score.2 2020 Score.3 2020 Year 2020 Score.1 2019 Score.2 2019 Score.3 2019 Year 2019 Score.1 2018 Score.2 2018 Score.3 2018 Year 2018
1 A2_101 blue 1 3 0 2020 1 3 0 2019 1 3 0 2018
2 B3_102 red 2 4 0 2020 2 4 0 2019 2 4 0 2018
3 LO_103 green 3 5 1 2020 3 5 1 2019 3 5 1 2018
4 TT_101 red 0 2 1 2020 NA NA NA NA NA NA NA NA
5 TT_201 yellow NA NA NA NA NA NA NA NA 3 5 3 2018
6 AA_345 purple NA NA NA NA NA NA NA NA 5 3 0 2018
Score.1 2017 Score.2 2017 Score.3 2017 Year 2017
1 1 3 0 2017
2 2 4 0 2017
3 3 5 1 2017
4 0 2 1 2017
5 NA NA NA NA
6 NA NA NA NA
Thanks in advance for the help!
Best regards, Thomas.
We can rename the columns before the join:
library(dplyr)
library(purrr)
library(stringr)
Total.Scores %>%
  imap(~ {
    nm1 <- .y
    rename_at(.x, vars(-c("ID", "Category")), ~ str_c(., nm1, sep = ' '))
  }) %>%
  reduce(full_join, by = c("ID", "Category"))
Output
ID Category Score.1 2020 Score.2 2020 Score.3 2020 Year 2020 Score.1 2019 Score.2 2019 Score.3 2019
1 A2_101 blue 1 3 0 2020 1 3 0
2 B3_102 red 2 4 0 2020 2 4 0
3 LO_103 green 3 5 1 2020 3 5 1
4 TT_101 red 0 2 1 2020 NA NA NA
5 TT_201 yellow NA NA NA NA NA NA NA
6 AA_345 purple NA NA NA NA NA NA NA
Year 2019 Score.1 2018 Score.2 2018 Score.3 2018 Year 2018 Score.1 2017 Score.2 2017 Score.3 2017 Year 2017
1 2019 1 3 0 2018 1 3 0 2017
2 2019 2 4 0 2018 2 4 0 2017
3 2019 3 5 1 2018 3 5 1 2017
4 NA NA NA NA NA 0 2 1 2017
5 NA 3 5 3 2018 NA NA NA NA
6 NA 5 3 0 2018 NA NA NA NA
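With dplyr >= 1.0 the same rename-then-join idea can also be spelled with rename_with(); a sketch assuming the same Total.Scores list:
library(dplyr)
library(purrr)
Total.Scores %>%
  imap(function(dat, nm) {
    # append each list element's name (the year) to all score/year columns
    rename_with(dat, function(cn) paste(cn, nm), .cols = -c(ID, Category))
  }) %>%
  reduce(full_join, by = c("ID", "Category"))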

Inserting rows into a table

I have this table (visit_ts) -
Year Month Number_of_visits
2011 4 1
2011 6 3
2011 7 23
2011 12 32
2012 1 123
2012 11 3200
The aim is to insert rows with Number_of_visits as 0, for months which are missing in the table.
Do not insert rows for 2011 where month is 1,2,3 or 2012 where month is 12.
The following code works correctly -
vec_month <- 1:12
vec_year <- 2011:2016
i <- 1
startyear <- head(visit_ts$Year, n = 1)
endyear <- tail(visit_ts$Year, n = 1)
x <- head(visit_ts$Month, n = 1)
y <- tail(visit_ts$Month, n = 1)
for (year in vec_year) {
  if (year %in% visit_ts$Year) {
    a <- subset(visit_ts, visit_ts$Year == year)
    index <- which(!vec_month %in% a$Month)
    for (j in index) {
      if ((year == startyear & j > x) | (year == endyear & j < y)) {
        visit_ts <- rbind(visit_ts, c(year, j, 0))
      } else if (year != startyear & year != endyear) {
        visit_ts <- rbind(visit_ts, c(year, j, 0))
      }
    }
  } else {
    i <- i + 1
  }
}
As I am new to R, I am looking for an alternative/better solution that does not involve hard-coding the year and month vectors. Also, please feel free to point out best programming practices.
We can use expand.grid with merge or left_join
library(dplyr)
expand.grid(Year = min(df1$Year):max(df1$Year), Month = 1:12) %>%
  filter(!(Year == min(df1$Year) & Month %in% 1:3 |
             Year == max(df1$Year) & Month == 12)) %>%
  left_join(., df1) %>%
  mutate(Number_of_visits = replace(Number_of_visits, is.na(Number_of_visits), 0))
# Year Month Number_of_visits
#1 2012 1 123
#2 2012 2 0
#3 2012 3 0
#4 2011 4 1
#5 2012 4 0
#6 2011 5 0
#7 2012 5 0
#8 2011 6 3
#9 2012 6 0
#10 2011 7 23
#11 2012 7 0
#12 2011 8 0
#13 2012 8 0
#14 2011 9 0
#15 2012 9 0
#16 2011 10 0
#17 2012 10 0
#18 2011 11 0
#19 2012 11 3200
#20 2011 12 32
We can make this more dynamic by grouping by 'Year', building the sequence of 'Month' from minimum to maximum as a list column, unnesting that column, joining with the original dataset (left_join), and replacing the NA values with 0.
library(tidyr)
df1 %>%
  group_by(Year) %>%
  summarise(Month = list(min(Month):max(Month))) %>%
  unnest(Month) %>%
  left_join(., df1) %>%
  mutate(Number_of_visits = replace(Number_of_visits, is.na(Number_of_visits), 0))
# Year Month Number_of_visits
# <int> <int> <dbl>
#1 2011 4 1
#2 2011 5 0
#3 2011 6 3
#4 2011 7 23
#5 2011 8 0
#6 2011 9 0
#7 2011 10 0
#8 2011 11 0
#9 2011 12 32
#10 2012 1 123
#11 2012 2 0
#12 2012 3 0
#13 2012 4 0
#14 2012 5 0
#15 2012 6 0
#16 2012 7 0
#17 2012 8 0
#18 2012 9 0
#19 2012 10 0
#20 2012 11 3200
Or another option is data.table. Convert the 'data.frame' to a 'data.table' (setDT(df1)); grouped by 'Year', build the sequence of min to max 'Month', join with the original dataset on 'Year' and 'Month', and replace the NA values with 0.
library(data.table)
setDT(df1)
df1[df1[, .(Month = min(Month):max(Month)), Year],
    on = c("Year", "Month")][is.na(Number_of_visits), Number_of_visits := 0][]
# Year Month Number_of_visits
# 1: 2011 4 1
# 2: 2011 5 0
# 3: 2011 6 3
# 4: 2011 7 23
# 5: 2011 8 0
# 6: 2011 9 0
# 7: 2011 10 0
# 8: 2011 11 0
# 9: 2011 12 32
#10: 2012 1 123
#11: 2012 2 0
#12: 2012 3 0
#13: 2012 4 0
#14: 2012 5 0
#15: 2012 6 0
#16: 2012 7 0
#17: 2012 8 0
#18: 2012 9 0
#19: 2012 10 0
#20: 2012 11 3200
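With current tidyr, complete() collapses the expand/join/replace pattern into a single call; a sketch that mirrors the first answer's month filter, assuming the same df1:
library(dplyr)
library(tidyr)
df1 %>%
  complete(Year = min(Year):max(Year), Month = 1:12,
           fill = list(Number_of_visits = 0)) %>%
  filter(!(Year == min(Year) & Month %in% 1:3 |
             Year == max(Year) & Month == 12))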

data standardization for all groups in a data.frame in R

I have a dataset as below
Date <- rep(c("Jan", "Feb"), 3)[1:5]
Group <- c(rep(letters[1:2],each=2),"c")
value <- sample(1:10,5)
data <- data.frame(Date, Group, value)
> data
Date Group value
1 Jan a 2
2 Feb a 7
3 Jan b 3
4 Feb b 9
5 Jan c 1
As you can observe, group c has no data for Date = Feb.
How can I make a dataset such that
> DATA
Date Group value
1 Jan a 2
2 Feb a 7
3 Jan b 3
4 Feb b 9
5 Jan c 1
6 Feb c 0
I have added a last row so that the value for group c in Feb is 0.
Thanks
With base R you can use xtabs wrapped in as.data.frame (the values below differ from the question's because sample() was not seeded):
as.data.frame(xtabs(formula = value ~ Date + Group, data = data))
# Date Group Freq
#1 Feb a 8
#2 Jan a 6
#3 Feb b 4
#4 Jan b 1
#5 Feb c 0
#6 Jan c 10
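Note that as.data.frame(xtabs(...)) names the aggregated column Freq; if it should be called value as in the desired output, rename it afterwards (res is just a throwaway name):
res <- as.data.frame(xtabs(value ~ Date + Group, data = data))
names(res)[names(res) == "Freq"] <- "value"
res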
Using merge:
#get all combinations of the 2 columns
all.comb <- expand.grid(unique(data$Date), unique(data$Group))
colnames(all.comb) <- c("Date", "Group")
#merge with all.x=TRUE to keep non-matched rows
res <- merge(all.comb, data, all.x = TRUE)
#convert NA to 0
res$value[is.na(res$value)] <- 0
#result
res
# Date Group value
# 1 Feb a 3
# 2 Feb b 4
# 3 Feb c 0
# 4 Jan a 5
# 5 Jan b 7
# 6 Jan c 10
Using reshape2
library(reshape2)
melt(dcast(data, Date ~ Group, value.var = "value", fill = 0), id.var = "Date")  # values differ as there was no set.seed()
# Date variable value
#1 Feb a 1
#2 Jan a 10
#3 Feb b 7
#4 Jan b 4
#5 Feb c 0
#6 Jan c 5
Or using dplyr
library(dplyr)
library(tidyr)
data %>%
  spread(Group, value, fill = 0) %>%
  gather(Group, value, a:c)
# Date Group value
#1 Feb a 1
#2 Jan a 10
#3 Feb b 7
#4 Jan b 4
#5 Feb c 0
#6 Jan c 5
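spread() and gather() are superseded in current tidyr; the same reshape with pivot_wider()/pivot_longer(), as a sketch:
library(dplyr)
library(tidyr)
data %>%
  pivot_wider(names_from = Group, values_from = value, values_fill = 0) %>%
  pivot_longer(-Date, names_to = "Group", values_to = "value")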
