Replace NA between two values without loop - r

I have the following data frame:
data <- structure(list(Date = structure(c(-17897, -17896, -17895, -17894,
-17893, -17892, -17891, -17890, -17889, -17888, -17887, -17887,
-17886, -17885, -17884, -17883, -17882, -17881, -17880, -17879,
-17878, -17877, -17876, -17875, -17874, -17873, -17872, -17871,
-17870, -17869, -17868, -17867, -17866, -17865, -17864), class = "Date"),
duration = c(NA, NA, NA, 5, NA, NA, NA, 5, NA, NA, 1, 1,
NA, NA, 3, NA, 3, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 4, NA, NA, 4, NA, NA), name = c(NA, NA, NA, "Date_beg",
NA, NA, NA, "Date_end", NA, NA, "Date_beg", "Date_end", NA,
NA, "Date_beg", NA, "Date_end", NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, "Date_beg", NA, NA, "Date_end", NA, NA
)), row.names = c(NA, -35L), class = c("tbl_df", "tbl", "data.frame"
))
And looks like:
Date duration name
<date> <dbl> <chr>
1 1921-01-01 NA NA
2 1921-01-02 NA NA
3 1921-01-03 NA NA
4 1921-01-04 5 Date_beg
5 1921-01-05 NA NA
6 1921-01-06 NA NA
7 1921-01-07 NA NA
8 1921-01-08 5 Date_end
9 1921-01-09 NA NA
10 1921-01-10 NA NA
...
I want to replace the NA values in column name that are between rows with Date_beg and Date_end with the word "event".
I have tried this:
data %<>% mutate(name = ifelse(((lag(name) == 'Date_beg')|(lag(name) == 'event')) &
But only the first row after Date_beg changes. It is quite easy with a for-loop, but I wanted to use a more R-like method.

There is probably a better way using data.table::nafill, but as you're using tidyverse functions, I would do it by creating an extra event column using tidyr::fill and then pulling it through to the name column where name is NA:
library(tidyr)
data %>%
mutate(
events = ifelse(
fill(data, name)$name == "Date_beg",
"event",
NA),
name = coalesce(name, events)
) %>%
select(-events)

You can do it by looking at the indices where there have been more "Date_beg" than "Dat_end" with:
data$name[lag(cumsum(data$name == "Date_beg" & !is.na(data$name))) -
cumsum(data$name == "Date_end" & !is.na(data$name)) >0] <- "event"
print(data, n=20)
# # A tibble: 35 x 3
# Date duration name
# <date> <dbl> <chr>
# 1 1921-01-01 NA NA
# 2 1921-01-02 NA NA
# 3 1921-01-03 NA NA
# 4 1921-01-04 5 Date_beg
# 5 1921-01-05 NA event
# 6 1921-01-06 NA event
# 7 1921-01-07 NA event
# 8 1921-01-08 5 Date_end
# 9 1921-01-09 NA NA
# 10 1921-01-10 NA NA
# 11 1921-01-11 1 Date_beg
# 12 1921-01-11 1 Date_end
# 13 1921-01-12 NA NA
# 14 1921-01-13 NA NA
# 15 1921-01-14 3 Date_beg
# 16 1921-01-15 NA event
# 17 1921-01-16 3 Date_end
# 18 1921-01-17 NA NA
# 19 1921-01-18 NA NA
# 20 1921-01-19 NA NA
# # ... with 15 more rows
Lagging the first index by one is required so that you don't overwrite the "Date_beg" at the start of each run.

Another dplyr approach using the cumsum function.
If the row in the name column in NA, it'll add 0 to the cumsum, otherwise add 1. Therefore the values under Date_beg will always be odd numbers (0 + 1) and the values under Date_end will always be even numbers (0 + 1 + 1). Then replace values that are odd in the ref column AND not NA in the name column with "event".
library(dplyr)
data %>%
mutate(ref = cumsum(ifelse(is.na(name), 0, 1)),
name = ifelse(ref %% 2 == 1 & is.na(name), "event", name)) %>%
select(-ref)

Related

How to filter only subjects observed more than once in panel data with R?

I am analyzing panel data with R now, and the data format is as follows.
pid wave edu marri rela age apt sido dongy urban stat1 stat2 exer dep3 bmi mmse
1 3122 1 2 <NA> NA NA <NA> NA <NA> <NA> <NA> <NA> <NA> <NA> NA <NA>
2 3122 1 NA 1 NA NA <NA> NA <NA> <NA> <NA> <NA> <NA> <NA> NA <NA>
3 3122 1 NA <NA> 3 NA <NA> NA <NA> <NA> <NA> <NA> <NA> <NA> NA <NA>
4 3122 1 NA <NA> NA 71 <NA> NA <NA> <NA> <NA> <NA> <NA> <NA> NA <NA>
5 3122 1 NA <NA> NA NA 1 NA <NA> <NA> <NA> <NA> <NA> <NA> NA <NA>
6 3122 1 NA <NA> NA NA <NA> 11 <NA> <NA> <NA> <NA> <NA> <NA> NA <NA>
The data are repeated measurements, and there are many missing values. If only the observed values are left at every year, the loss of the number is large, so I want to select and analyze only subjects who have been measured more than once among the 'mmse' variables.
I tried to check the change of the variable of interest through the following code, but it didn't work.
df %>%
arrange(pid, wave) %>%
group_by(pid) %>%
mutate(
mmse_change = mmse - lag(mmse),
mmse_increase = mmse_change > 0,
mmse_decrease = mmse_change < 0
)
I need the above object to analyze the baseline characteristic. How can I extract subjects with this condition?
We could do something like this:
df %>%
filter(!is.na(mmse)) %>% # just keep rows with non-NA in mmse
count(pid) %>% # count how many observations per pid
filter(n > 1) %>% # keep those pid's appearing more than once
select(pid) %>% # just keep the pid column
left_join(df) # get `df` for just those pid's
Another approach without join is to group_by(pid) and then filter all groups where max(row_number()) > 1.
Below I changed your initial data so that it can be used for this problem (your original data has only NAs in mmse and please put your data in reproducible code next).
library(tidyverse)
# initial data slightly changed:
df <- tribble(~pid, ~wave, ~edu, ~marri, ~rela, ~age, ~apt, ~sido, ~dongy, ~urban, ~stat1, ~stat2, ~exer, ~dep3, ~bmi, ~mmse,
3122 , 1, 2, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1,
3122 , 1, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
3122 , 1, NA, NA, 3, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2,
3122 , 1, NA, NA, NA, 71, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
3122 , 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, 3,
3124 , 1, NA, NA, NA, NA, NA, 11, NA, NA, NA, NA, NA, NA, NA, 5)
df %>%
filter(!is.na(mmse)) %>%
group_by(pid) %>%
filter(max(row_number()) > 1) %>%
ungroup()
#> # A tibble: 3 x 16
#> pid wave edu marri rela age apt sido dongy urban stat1 stat2 exer
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <lgl> <lgl> <lgl> <lgl>
#> 1 3122 1 2 NA NA NA NA NA NA NA NA NA NA
#> 2 3122 1 NA NA 3 NA NA NA NA NA NA NA NA
#> 3 3122 1 NA NA NA NA 1 NA NA NA NA NA NA
#> # ... with 3 more variables: dep3 <lgl>, bmi <lgl>, mmse <dbl>
Created on 2022-09-21 by the reprex package (v2.0.1)

Merge survey columns across variables in R

I am analyzing a very large survey in which I want to combine four parts of the survey, through several combinations of 4 questions. Below I have created a small example. A little background: a respondent either answered q2, q5, q8 or q9, because they only filled in 1 of 4 parts of the survey based on their answer in q1 (not shown here).Therefore, only one of the four columns contains an answer (1 or 2), while the others contain NAs. q2, q5, q8, q9 are similar questions that have the same answer options, which is why I want to combine them to make my dataset less wide and make it easier to further analyze the data.
q2_1 <- c(NA, NA, NA, NA, NA, NA, rep(c(1:2), 1))
q5_1 <- c(NA, NA, NA, NA, rep(c(1:2), 1), NA, NA)
q8_1 <- c(NA, NA, rep(c(1:2), 1), NA, NA, NA, NA)
q9_1 <- c(rep(c(1:2), 1), NA, NA, NA, NA, NA, NA)
q2_2 <- c(NA, NA, NA, NA, NA, NA, rep(c(1:2), 1))
q5_2 <- c(NA, NA, NA, NA, rep(c(1:2), 1), NA, NA)
q8_2 <- c(NA, NA, rep(c(1:2), 1), NA, NA, NA, NA)
q9_2 <- c(rep(c(1:2), 1), NA, NA, NA, NA, NA, NA)
df <- data.frame(q2_1, q5_1, q8_1, q9_1, q2_2, q5_2, q8_2, q9_2)
df
# running df shows:
q2_1 q5_1 q8_1 q9_1 q2_2 q5_2 q8_2 q9_2
1 NA NA NA 1 NA NA NA 1
2 NA NA NA 2 NA NA NA 2
3 NA NA 1 NA NA NA 1 NA
4 NA NA 2 NA NA NA 2 NA
5 NA 1 NA NA NA 1 NA NA
6 NA 2 NA NA NA 2 NA NA
7 1 NA NA NA 1 NA NA NA
8 2 NA NA NA 2 NA NA NA
My desired end result would be a dataframe with only columns for questions starting with q2_ (so, in the example that would be q2_1 and q2_2; in reality there's about 20 for this question), but with the NAs replaced for the answer options from the corresponding q5_, q8_, and q_9.
# desired end result
q2_1 q2_2
1 1 1
2 1 2
3 1 1
4 2 2
5 1 1
6 2 2
7 1 1
8 2 2
For single questions, i've done this using the code below, but this is very manual and because q2, q5, q8, and q9 both go up to _20, I'm looking for a way to automate this more.
# example single question
library(tidyverse)
df <- df %>%
mutate(q2_1 = case_when(!is.na(q2_1) ~ q2_1,
!is.na(q5_1) ~ q5_1,
!is.na(q8_1) ~ q8_1,
!is.na(q9_1) ~ q9_1))
I hope I explained myself well enough and looking forward for some directions!
Here's one way, using coalesce:
df %>%
mutate(q2_1 = do.call(coalesce, across(ends_with('_1'))),
q2_2 = do.call(coalesce, across(ends_with('_2')))) %>%
select(q2_1, q2_2)
#> q2_1 q2_2
#> 1 1 1
#> 2 2 2
#> 3 1 1
#> 4 2 2
#> 5 1 1
#> 6 2 2
#> 7 1 1
#> 8 2 2
q2_1 <- c(NA, NA, NA, NA, NA, NA, rep(c(1:2), 1))
q5_1 <- c(NA, NA, NA, NA, rep(c(1:2), 1), NA, NA)
q8_1 <- c(NA, NA, rep(c(1:2), 1), NA, NA, NA, NA)
q9_1 <- c(rep(c(1:2), 1), NA, NA, NA, NA, NA, NA)
q2_2 <- c(NA, NA, NA, NA, NA, NA, rep(c(1:2), 1))
q5_2 <- c(NA, NA, NA, NA, rep(c(1:2), 1), NA, NA)
q8_2 <- c(NA, NA, rep(c(1:2), 1), NA, NA, NA, NA)
q9_2 <- c(rep(c(1:2), 1), NA, NA, NA, NA, NA, NA)
df <- data.frame(q2_1, q5_1, q8_1, q9_1, q2_2, q5_2, q8_2, q9_2)
df
#> q2_1 q5_1 q8_1 q9_1 q2_2 q5_2 q8_2 q9_2
#> 1 NA NA NA 1 NA NA NA 1
#> 2 NA NA NA 2 NA NA NA 2
#> 3 NA NA 1 NA NA NA 1 NA
#> 4 NA NA 2 NA NA NA 2 NA
#> 5 NA 1 NA NA NA 1 NA NA
#> 6 NA 2 NA NA NA 2 NA NA
#> 7 1 NA NA NA 1 NA NA NA
#> 8 2 NA NA NA 2 NA NA NA
library(tidyverse)
suffix <- str_c("_", 1:2)
map_dfc(.x = suffix,
.f = ~ transmute(df, !!str_c("q2", .x) := rowSums(across(ends_with(.x
)), na.rm = T)))
#> q2_1 q2_2
#> 1 1 1
#> 2 2 2
#> 3 1 1
#> 4 2 2
#> 5 1 1
#> 6 2 2
#> 7 1 1
#> 8 2 2
Created on 2022-04-04 by the reprex package (v2.0.1)

Mutate and ifelse with NA and values from other variable in R [duplicate]

This question already has answers here:
R: coalescing a large data frame
(2 answers)
How to implement coalesce efficiently in R
(9 answers)
Closed 2 years ago.
I have a df that looks something like this:
id <- c(1:8)
born.swis <- c(0, 1, NA, NA, NA, 2, NA, NA)
born2005 <- c(NA, NA, 2, NA, NA, NA, NA, NA)
born2006 <- c(NA, NA, NA, 1, NA, NA, NA, NA)
born2007 <- c(NA, NA, NA, NA, NA, NA, NA, 1)
born2008 <- c(NA, NA, NA, NA, NA, NA, 2, NA)
born2009 <- c(NA, NA, NA, NA, NA, NA, NA, NA)
df <- data.frame(id, born.swis, born2005, born2006, born2007, born2008, born2009)
I'm trying to mutate born.swis based on the values of the other variables. Basically, I want the value bornswis to be filled with the value one of the other variables IF born.id is NA and IF it is not NA for that variable. Something like this:
id <- c(1:8)
born.swis <- c(0, 1, 2, 1, NA, 2, 2,1)
df.desired <- data.frame(id, born.swis)
I tried several things with mutate and ifelse, like this:
df <- df%>%
mutate(born.swis = ifelse(is.na(born.swis), born2005, NA,
ifelse(is.na(born.swis), born2006, NA,
ifelse(is.na(born.swis), born2007, NA,
ifelse(is.na(born.swis), born2008, NA,
ifelse(is.na(born.swis), born2009, NA,)
)))))
and similar things, but I'm not able to reach my desired outcome.
Any ideas?
Many thanks!
One dplyr option could be:
df %>%
mutate(born.swis_res = coalesce(!!!select(., starts_with("born"))))
id born.swis born2005 born2006 born2007 born2008 born2009 born.swis_res
1 1 0 NA NA NA NA NA 0
2 2 1 NA NA NA NA NA 1
3 3 NA 2 NA NA NA NA 2
4 4 NA NA 1 NA NA NA 1
5 5 NA NA NA NA NA NA NA
6 6 2 NA NA NA NA NA 2
7 7 NA NA NA NA 2 NA 2
8 8 NA NA NA 1 NA NA 1
Or with dplyr 1.0.0:
df %>%
mutate(born.swis_res = Reduce(coalesce, across(starts_with("born"))))
In base R, you can use max.col :
df[cbind(1:nrow(df), max.col(!is.na(df[-1])) + 1 )]
#[1] 0 1 2 1 NA 2 2 1
max.col gives the column position of the first non-NA value in each row (exlcuding first column), we create a matrix with row-index and use it to subset df.
base R
df$born.swis <- apply(df[-1], 1, function(x) ifelse(all(is.na(x)), NA, sum(x, na.rm = T)))

If column A has a value, summarise variables in column B, until next variable in column A appears

I have performed an experiment in which people are moving around cubes until they made a figure they like. When they like a figure, they save if and creates a new one. The script tracked time and number of moves between all figures safes.
I now have a column (A) with number of moves between each save and a column (B) with the time between each move until the figure is saved. Thus column A is filled with NA's and then a number (signifies figure safe) and column B has time in seconds in all rows (except from first row) signifing all the moves made.
Excerpt of data:
A B C
NA 1.6667798
NA 3.3326443
NA 3.5506110
NA 11.4995562
NA 1.4334849
NA 4.9502637
NA 2.1161980
NA 4.7833326
NA 2.8500842
NA 4.0331373
NA 4.3498785
12 5.0910905 Sum
NA 4.2424078
NA 1.7332665
NA 1.5341006
3 4.8923275 Sum
NA 4.1064621
NA 3.3498289
NA 1.6002373
3 6.0122170 Sum
I have tried several loop options, but I cannot seem make it work properly.
I made this loop, but it is not doing the correct calculation in column C.
data$C <- rep(NA, nrow(data))
for (i in unique(data$id)) {
C <- which(data$id == i & data$type == "moveblock")
for (e in 1:length(C)){
if (e == 1){
data$C[C[e]] = C[e] - which(data$id == i)[1]
}
else if (e > 1){
data$C[C[e]] = C[e] + C[e+1]+1}
}
d_times <- which(data$id == i)
for (t in 2:length(d_times)){
data$B[d_times[t]] <- data$time[d_times[t]] - data$time[d_times[t-1]]
}
}
I want a new column (C) which has the sum of all rows from column B until a figure has been saved = a number in column A. In other words, I want to calculate the total time it took the subject to make all the moves before saving the figure.
Hope anyone can figure this out!
We can create groups based on occurrence of non NA values and take sum
library(dplyr)
df %>%
group_by(group = lag(cumsum(!is.na(A)), default = 0)) %>%
summarise(sum = sum(B, na.rm = TRUE))
# group sum
# <dbl> <dbl>
#1 0 49.7
#2 1 12.4
#3 2 15.1
In base R, we can use aggregate to do the same
aggregate(B~c(0, head(cumsum(!is.na(A)), -1)), df, sum, na.rm = TRUE)
data
df <- structure(list(A = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 12L, NA, NA, NA, 3L, NA, NA, NA, 3L), B = c(1.6667798, 3.3326443,
3.550611, 11.4995562, 1.4334849, 4.9502637, 2.116198, 4.7833326,
2.8500842, 4.0331373, 4.3498785, 5.0910905, 4.2424078, 1.7332665,
1.5341006, 4.8923275, 4.1064621, 3.3498289, 1.6002373, 6.012217
)), class = "data.frame", row.names = c(NA, -20L))
You could create a matrix out of the periods (i.e. sequences) and sum the values of column B accordingly. For this create vector saved that indicates where a subject has "saved" and list sequences using apply(). Finally the sapply() loops over the sequences in the periods list.
saved <- which(!is.na(dat$A))
periods <- apply(cbind(c(1, (saved + 1)[-3]), saved), 1, function(x) seq(x[1], x[2]))
dat$C[saved] <- sapply(periods, function(x) sum(dat$B[x]))
Result
dat$C
# [1] NA NA NA NA NA NA NA NA NA
# [10] NA NA 49.65706 NA NA NA 12.40210 NA NA
# [19] NA 15.06875
Data
dat <- structure(list(A = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 12L, NA, NA, NA, 3L, NA, NA, NA, 3L), B = c(1.6667798, 3.3326443,
3.550611, 11.4995562, 1.4334849, 4.9502637, 2.116198, 4.7833326,
2.8500842, 4.0331373, 4.3498785, 5.0910905, 4.2424078, 1.7332665,
1.5341006, 4.8923275, 4.1064621, 3.3498289, 1.6002373, 6.012217
), C = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, -20L), class = "data.frame")

fixing a messy dataframe with tidyr in R

I have a dataset with observations about households; within each household there are individuals. The number of individuals per household differs. Households are identified with an id and members of the household are identified according to the order they were interviewed. So if household 1 had 4 members, the variable id is the same across all of them, but variable order goes from 1 to 4. The problem I have is that, for some variables, only the first member of the household answered for the rest of the members; therefore I have a mixture of long and wide format within my dataset.
What I need to do is to assign to the correspondent members of the household the values that were answered by the first member of the household. To explain further the structure of my data I´ll give the following toy example:
df <- data.frame( id = c(rep(1,4), rep(2,5)), order = c(1:4,1:5),
age = c(54,20,23,17, 60,57,28,33,19),
educDebt1 = c(1, NA, NA, NA, 3, NA, NA, NA, NA),
educDebt2 = c(3, NA, NA, NA, 5, NA, NA, NA, NA),
educDebt3 = c(NA, NA, NA, NA, 4, NA, NA, NA, NA),
educDebt1t = c("student loan", NA,NA,NA,
"student loan", NA, NA, NA, NA),
educDebt2t = c("student fund", NA, NA, NA,
"bank credit", NA, NA, NA, NA),
educdebt3t = c(NA, NA, NA, NA,
"bank credit", NA, NA, NA, NA),
educDebt1t_r = c("yes", NA,NA,NA, "no",NA,NA,NA,NA),
educDebt2t_r = c("no", NA, NA, NA, "no", NA,NA,NA,NA),
educDebt3t_r = c(NA,NA,NA,NA, "yes", NA,NA,NA,NA),
bankDebt1 = c(1, NA, NA, NA, 3, NA, NA, NA, NA),
bankDebt2 = c(4, NA, NA, NA, 2, NA, NA, NA, NA),
bankDebt3 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
bankDebt1t = c("car loan", NA,NA,NA,
"consumer loan", NA, NA, NA, NA),
bankDebt2t = c("car loan", NA, NA, NA,
"car loan", NA, NA, NA, NA),
bankdebt3t = c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
bankDebt1t_r = c("yes", NA,NA,NA, "yes",NA,NA,NA,NA),
bankDebt2t_r = c("no", NA, NA, NA, "no", NA,NA,NA,NA),
bankDebt3t_r = c(NA,NA,NA,NA, NA, NA,NA,NA,NA))
I only show some of the columns, for not cluttering the page.
id order age educDebt1 educDebt2 educDebt3 educDebt1t educDebt2t educdebt3A
1 1 54 1 3 NA student loan student fund NA
1 2 20 NA NA NA NA NA NA
1 3 23 NA NA NA NA NA NA
1 4 17 NA NA NA NA NA NA
2 1 60 3 5 4 student loan bank credit bank credit
2 2 57 NA NA NA NA NA NA
2 3 28 NA NA NA NA NA NA
2 4 33 NA NA NA NA NA NA
2 5 19 NA NA NA NA NA NA
In the toy example from above, I have a household level variable id and individual level variables: order corresponds to the order of the individual in the household; age is their age. The other variables correspond to debts. A household can report at most three debts for each type of debt. In this case there are two types of debt, educational debt educDebt or bank debt bankdebt(only one type is shown above).
So in each household, only the member corresponding to order == 1 answer for the rest of the members in the household. In educDebt1 till educDebt3, the value corresponds to the member of the household with the debt, therefore, if we take a look at the first row, it says that household member 1 of household 1 has an educational debt, as well as household member 3. Then, from educDebt1t to educDebt3t, it tells which type of debt the household member has. In household 2, three are the members with debts, household members: 3, 5 and 4.
Then we have another type of debt, bank debt, and the logic is the same as before.
What I want to accomplish, is to have every member of the household and their debts in a row, something like this:
id order age educDebt educDebt_r bankDebt bankDebt_r
1 1 54 student loan yes car loan yes
1 2 20 NA NA NA NA
1 3 23 student fund no NA NA
1 4 17 NA NA car loan no
2 1 60 NA NA NA NA
2 2 57 NA NA car loan no
2 3 28 student loan no consumer loan yes
2 4 33 bank credit yes NA NA
2 5 19 bank credit no NA NA
For accomplishing this I actually divided the data in different tables, one with the first three variables, and others for each type of debt. For the debt tables I only kept the row of the interviewed member, and reshape the data to long format so each row became a household member, and then I merged the tables by household and household member id, but there are many debt types, and my aproach is quite inefficient. Is there a way I could achieve the same result with the tidyr package?
My approach was the following:
First, I created three data frames, that extracted different column indexes for each row. I did it with a for loop.
newdf1 <- data.frame()
ind <- c(1,seq(4,19, 3))
for(j in 1:nrow(df)){
fila <- c()
for(i in 1:length(ind)){
dato <- as.character(df[j,ind[i]])
fila <- c(fila, dato)
}
newdf1 <- rbind(newdf1, fila, stringsAsFactors = FALSE )
}
newdf2 <- data.frame()
ind <- c(1,seq(5,20, 3))
for(j in 1:nrow(df)){
fila <- c()
for(i in 1:length(ind)){
dato <- as.character(df[j,ind[i]])
fila <- c(fila, dato)
}
newdf2 <- rbind(newdf2, fila, stringsAsFactors = FALSE )
}
newdf3 <- data.frame()
ind <- c(1,seq(6,21, 3))
for(j in 1:nrow(df)){
fila <- c()
for(i in 1:length(ind)){
dato <- as.character(df[j,ind[i]])
fila <- c(fila, dato)
}
newdf3 <- rbind(newdf3, fila, stringsAsFactors = FALSE )
}
Then I rowbinded them:
NewDfs <- rbind(newdf1,setNames(newdf2, names(newdf1)),
setNames(newdf3, names(newdf1)))
names(NewDfs ) <- c("id", "order", "educDebt", "educDebt_r",
"order", "bankDebt", "bankDebt_r")
From this dataframe, I extracted the debts regarding education in one dataframe, and the debts regarding bank in another, keep only the compelte cases, and merge them together by id and order.
educ <- NewDfs [,c(1:4)]
bank <- NewDfs [,c(1,5:7)]
educ <- educ[complete.cases(educ), ]
bank <- bank[complete.cases(bank), ]
I also created a datarame with the first three columns of the original dataset.
df_household <- df[,1:3]
And merged it with the educ_bank data frame.
dfMerged <- merge(df_hog, educ_bank, by = c("id", "order"), all.x = TRUE)
id order age educDebt educDebt_r bankDebt bankDebt_r
1 1 54 student loan yes car loan yes
1 2 20 <NA> <NA> <NA> <NA>
1 3 23 student fund no <NA> <NA>
1 4 17 <NA> <NA> car loan no
2 1 60 <NA> <NA> <NA> <NA>
2 2 57 <NA> <NA> car loan no
2 3 28 student loan no consumer loan yes
2 4 33 bank credit yes <NA> <NA>
2 5 19 bank credit no <NA> <NA>
Evidently, this doen´t seem to be the most straightforward way of doing it, and I was wondering if there was a simplier way of achieving the same with tidyr.
I don't have a solution that is completely tidyr (and dplyr), though perhaps somebody more familiar with it can assist. (There is room to include more of the tidyverse, specifically purrr, to replace some of the base R code, but I thought it unnecessary.) I'll walk through each step with the solution at the bottom.
Data
First, I think some of the columns are misnamed (lower-case "debt"), so I fixed it; that's not absolutely critical, but it makes some things much easier. I also disable factors, as some operations (on debt, below) require strings. If having factors is important, I suggest you re-factor after this process.
df <- data.frame(
id = c(rep(1,4), rep(2,5)), order = c(1:4,1:5),
age = c(54,20,23,17, 60,57,28,33,19),
educDebt1 = c(1, NA, NA, NA, 3, NA, NA, NA, NA),
educDebt2 = c(3, NA, NA, NA, 5, NA, NA, NA, NA),
educDebt3 = c(NA, NA, NA, NA, 4, NA, NA, NA, NA),
educDebt1t = c("student loan", NA,NA,NA, "student loan", NA, NA, NA, NA),
educDebt2t = c("student fund", NA, NA, NA, "bank credit", NA, NA, NA, NA),
educDebt3t = c(NA, NA, NA, NA, "bank credit", NA, NA, NA, NA),
educDebt1t_r = c("yes", NA,NA,NA, "no",NA,NA,NA,NA),
educDebt2t_r = c("no", NA, NA, NA, "no", NA,NA,NA,NA),
educDebt3t_r = c(NA,NA,NA,NA, "yes", NA,NA,NA,NA),
bankDebt1 = c(1, NA, NA, NA, 3, NA, NA, NA, NA),
bankDebt2 = c(4, NA, NA, NA, 2, NA, NA, NA, NA),
bankDebt3 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
bankDebt1t = c("car loan", NA,NA,NA, "consumer loan", NA, NA, NA, NA),
bankDebt2t = c("car loan", NA, NA, NA, "car loan", NA, NA, NA, NA),
bankDebt3t = c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
bankDebt1t_r = c("yes", NA,NA,NA, "yes",NA,NA,NA,NA),
bankDebt2t_r = c("no", NA, NA, NA, "no", NA,NA,NA,NA),
bankDebt3t_r = c(NA,NA,NA,NA, NA, NA,NA,NA,NA),
stringsAsFactors = FALSE
)
library(dplyr)
library(tidyr)
Step-through
Eventually, we're going to merge in age, and since all respondents are identified by both id and order, so we separate the three:
maintbl <- select(df, id, order, age)
The first thing to realize (for me) is that you need to convert from wide-to-tall, but individually for each three column group. I'll start with the first bunch of three:
grp <- "educDebt"
select(df, id, matches(paste0(grp, "[0-9]+$"))) %>%
gather(debt, order, -id) %>%
filter(! is.na(order)) %>%
arrange(id, order)
# id debt order
# 1 1 educDebt1 1
# 2 1 educDebt2 3
# 3 2 educDebt1 3
# 4 2 educDebt3 4
# 5 2 educDebt2 5
(BTW 1: the reason I'm using grp will be apparent later.)
(BTW 2: I used the regex [0-9]+ to match one or more digit, in case this is expanded to include either more than 9 or "arbitrary" numbering. Feel free to omit the +.)
This seems fine. We now need to cbind the *t variant of these three:
select(df, id, matches(paste0(grp, "[0-9]+t$"))) %>%
gather(debt, type, -id) %>%
filter(! is.na(type)) %>%
mutate(debt = gsub("t$", "", debt))
# id debt type
# 1 1 educDebt1 student loan
# 2 2 educDebt1 student loan
# 3 1 educDebt2 student fund
# 4 2 educDebt2 bank credit
# 5 2 educDebt3 bank credit
I changed debt to remove the trailing t, as I'm going to use that as a merging column later. I do the same thing for the third group of three (for "educDebt"), the t_r columns.
These three columns need to be combined, so here I place them in a list and Reduce them:
Reduce(function(x,y) left_join(x, y, by = c("id", "debt")),
list(
select(df, id, matches(paste0(grp, "[0-9]+$"))) %>%
gather(debt, order, -id) %>%
filter(! is.na(order)) %>%
arrange(id, order),
select(df, id, matches(paste0(grp, "[0-9]+t$"))) %>%
gather(debt, type, -id) %>%
filter(! is.na(type)) %>%
mutate(debt = gsub("t$", "", debt)),
select(df, id, matches(paste0(grp, "[0-9]+t_r$"))) %>%
gather(debt, r, -id) %>%
filter(! is.na(r)) %>%
mutate(debt = gsub("t_r$", "", debt))
))
# id debt order type r
# 1 1 educDebt1 1 student loan yes
# 2 1 educDebt2 3 student fund no
# 3 2 educDebt1 3 student loan no
# 4 2 educDebt3 4 bank credit yes
# 5 2 educDebt2 5 bank credit no
I'll need to rename the last two columns, and since I'm done combining the type and r columns, I can drop debt. (I'd normally suggest dplyr::rename_, but since it is being deprecated shortly, I'm doing it manually. If you have significantly more columns than shown here, you may need to adjust the column numbering, etc.)
Lastly, we need to do this for each of "educDebt" and "bankDebt", join these by id and order (using another Reduce), and finally re-merge in the age.
TL;DR
Reduce(function(x,y) left_join(x, y, by = c("id", "order")),
lapply(c("educDebt", "bankDebt"), function(grp) {
ret <- Reduce(function(x,y) left_join(x, y, by = c("id", "debt")),
list(
select(df, id, matches(paste0(grp, "[0-9]+$"))) %>%
gather(debt, order, -id) %>%
filter(! is.na(order)) %>%
arrange(id, order),
select(df, id, matches(paste0(grp, "[0-9]+t$"))) %>%
gather(debt, type, -id) %>%
filter(! is.na(type)) %>%
mutate(debt = gsub("t$", "", debt)),
select(df, id, matches(paste0(grp, "[0-9]+t_r$"))) %>%
gather(debt, r, -id) %>%
filter(! is.na(r)) %>%
mutate(debt = gsub("t_r$", "", debt))
))
names(ret)[4:5] <- c(grp, paste0(grp, "_r"))
select(ret, -debt)
})
) %>%
left_join(maintbl, ., by = c("id", "order"))
# id order age educDebt educDebt_r bankDebt bankDebt_r
# 1 1 1 54 student loan yes car loan yes
# 2 1 2 20 <NA> <NA> <NA> <NA>
# 3 1 3 23 student fund no <NA> <NA>
# 4 1 4 17 <NA> <NA> <NA> <NA>
# 5 2 1 60 <NA> <NA> <NA> <NA>
# 6 2 2 57 <NA> <NA> <NA> <NA>
# 7 2 3 28 student loan no consumer loan yes
# 8 2 4 33 bank credit yes <NA> <NA>
# 9 2 5 19 bank credit no <NA> <NA>

Resources