Related
Below are my two dataframes, df1 and df2
df1 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420"),text=c("asdf","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten"),response=c("y","y","y","n","n","y","y","n","n","y"))
id text response
1 632592651 asdf y
2 633322173 cat y
3 634703802 dog y
4 634927873 mouse n
5 635812953 elephant n
6 636004739 goose y
7 636101211 rat y
8 636157799 mice n
9 636263106 kitty n
10 636752420 kitten y
df2 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420","636809222","2004722036","2004894388","2005045755","2005535472","2005630542","2005788781","2005809679","2005838317","2005866692"),
text=c("asdf_xyz","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten","tiger_xyz","lion","leopard","ostrich","kangaroo","platypus","fish","reptile","mammals","amphibians_xyz"),
volume=c("1234","432","324","333","2223","412346","7456","3456","2345","2345","6","345","23","2","4778","234","8675","3459","8","9"))
id text volume
1 632592651 asdf_xyz 1234
2 633322173 cat 432
3 634703802 dog 324
4 634927873 mouse 333
5 635812953 elephant 2223
6 636004739 goose 412346
7 636101211 rat 7456
8 636157799 mice 3456
9 636263106 kitty 2345
10 636752420 kitten 2345
11 636809222 tiger_xyz 6
12 2004722036 lion 345
13 2004894388 leopard 23
14 2005045755 ostrich 2
15 2005535472 kangaroo 4778
16 2005630542 platypus 234
17 2005788781 fish 8675
18 2005809679 reptile 3459
19 2005838317 mammals 8
20 2005866692 amphibians_xyz 9
How do I set the rows of df2 whose id has no match in df1 (rows 11:20) entirely to NA, and also set the 'text' value of row 1 (asdf_xyz, which differs from the text in df1) to NA?
I have tried
library(dplyr)
df3 <- df2 %>%
anti_join(df1, by=c("id"))
id text volume
1 636809222 tiger_xyz 6
2 2004722036 lion 345
3 2004894388 leopard 23
4 2005045755 ostrich 2
5 2005535472 kangaroo 4778
6 2005630542 platypus 234
7 2005788781 fish 8675
8 2005809679 reptile 3459
9 2005838317 mammals 8
10 2005866692 amphibians_xyz 9
df3$id[df3$id != 0] <- NA
df3$text[df3$text != 0] <- NA
df3$volume[df3$volume != 0] <- NA
(I did this one column at a time because I couldn't find a way to set every value of the data frame to NA at once.)
id text volume
1 <NA> <NA> <NA>
2 <NA> <NA> <NA>
3 <NA> <NA> <NA>
4 <NA> <NA> <NA>
5 <NA> <NA> <NA>
6 <NA> <NA> <NA>
7 <NA> <NA> <NA>
8 <NA> <NA> <NA>
9 <NA> <NA> <NA>
10 <NA> <NA> <NA>
and df4 (solution from How to return row values that match column 'id' in both df1 and df2 but not column 'text' and return NA to the mismatch in column 'text'?)
inner_join(x = df1,
y = df2,
by = "id") %>%
mutate_if(is.factor, as.character) %>%
mutate(text = ifelse(test = text.x != text.y,
yes = NA,
no = text.x)) %>%
select(id, text, response, volume)
id text response volume
1 632592651 <NA> y 1234
2 633322173 cat y 432
3 634703802 dog y 324
4 634927873 mouse n 333
5 635812953 elephant n 2223
6 636004739 goose y 412346
7 636101211 rat y 7456
8 636157799 mice n 3456
9 636263106 kitty n 2345
10 636752420 kitten y 2345
but not sure how to replace df2 with df3 and df4. The desired output is shown below:
id text volume
1 632592651 NA 1234
2 633322173 cat 432
3 634703802 dog 324
4 634927873 mouse 333
5 635812953 elephant 2223
6 636004739 goose 412346
7 636101211 rat 7456
8 636157799 mice 3456
9 636263106 kitty 2345
10 636752420 kitten 2345
11 NA NA NA
12 NA NA NA
13 NA NA NA
14 NA NA NA
15 NA NA NA
16 NA NA NA
17 NA NA NA
18 NA NA NA
19 NA NA NA
20 NA NA NA
Can someone help please?
If possible, may I also know if there's a manual approach to select subset of df2 based on df3$id and change all values to NA?
Part 2:
For the second part of my request, I would like to create another data frame from joined_df containing only the rows that appear in df1 (call it found_in_df1). Example of output:
found_in_df1:
# id text volume
# 1: 632592651 <NA> 1234
# 2: 633322173 cat 432
# 3: 634703802 dog 324
# 4: 634927873 mouse 333
# 5: 635812953 elephant 2223
# 6: 636004739 goose 412346
# 7: 636101211 rat 7456
# 8: 636157799 mice 3456
# 9: 636263106 kitty 2345
#10: 636752420 kitten 2345
The solution is given in How to return row values that match column 'id' in both df1 and df2 but not column 'text' and return NA to the mismatch in column 'text'? but I'm looking for an alternative approach, i.e., is it possible to write a script to say retrieve from joined_df using df1 to give found_in_df1 since we have df1 and joined_df?
One potential solution for dealing with conflicts is to use the powerjoin package, e.g.
library(dplyr)
df1 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420"),
text=c("asdf","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten"),
response=c("y","y","y","n","n","y","y","n","n","y"))
df2 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420","636809222","2004722036","2004894388","2005045755","2005535472","2005630542","2005788781","2005809679","2005838317","2005866692"),
text=c("asdf_xyz","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten","tiger_xyz","lion","leopard","ostrich","kangaroo","platypus","fish","reptile","mammals","amphibians_xyz"),
volume=c(1234,432,324,333,2223,412346,7456,3456,2345,2345,6,345,23,2,4778,234,8675,3459,8,9))
expected_outcome <- data.frame(id = c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
text = c(NA, "cat", "dog", "mouse", "elephant", "goose",
"rat", "mice", "kitty", "kitten",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
volume = c(1234, 432, 324, 333, 2223, 412346, 7456,
3456, 2345, 2345, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA))
library(powerjoin)
joined_df <- power_full_join(df1, df2, by = c("id"),
conflict = rw ~ ifelse(.x != .y,
NA_integer_,
.x))
final_df <- joined_df %>%
mutate(across(everything(), ~ifelse(is.na(response), NA, .x))) %>%
select(id, text, volume)
final_df
#> id text volume
#> 1 632592651 <NA> 1234
#> 2 633322173 cat 432
#> 3 634703802 dog 324
#> 4 634927873 mouse 333
#> 5 635812953 elephant 2223
#> 6 636004739 goose 412346
#> 7 636101211 rat 7456
#> 8 636157799 mice 3456
#> 9 636263106 kitty 2345
#> 10 636752420 kitten 2345
#> 11 <NA> <NA> NA
#> 12 <NA> <NA> NA
#> 13 <NA> <NA> NA
#> 14 <NA> <NA> NA
#> 15 <NA> <NA> NA
#> 16 <NA> <NA> NA
#> 17 <NA> <NA> NA
#> 18 <NA> <NA> NA
#> 19 <NA> <NA> NA
#> 20 <NA> <NA> NA
all_equal(final_df, expected_outcome)
#> [1] TRUE
# Part 2
found_in_df1 <- power_left_join(df1, df2, by = c("id"),
conflict = rw ~ ifelse(.x != .y,
NA_integer_,
.x)) %>%
select(id, text, volume)
found_in_df1
#> id text volume
#> 1 632592651 <NA> 1234
#> 2 633322173 cat 432
#> 3 634703802 dog 324
#> 4 634927873 mouse 333
#> 5 635812953 elephant 2223
#> 6 636004739 goose 412346
#> 7 636101211 rat 7456
#> 8 636157799 mice 3456
#> 9 636263106 kitty 2345
#> 10 636752420 kitten 2345
Created on 2022-07-02 by the reprex package (v2.0.1)
Edit
Per the comment below from the creator of the powerjoin package (Mr. Mudskipper): these operations are vectorised, so you don't need to perform the command 'rowwise', i.e. you can remove "rw" to simplify and gain performance. There is no practical difference between including and excluding "rw" with df1 and df2, but if we use larger dataframes you can see a clear increase in performance, e.g.
library(dplyr)
library(powerjoin)
# define functions
# Full-join `df1` and `df2` on "id", resolving conflicting (same-named)
# columns with the row-wise (`rw ~`) form of the conflict function: where the
# two sides differ the value becomes NA, otherwise the left value is kept.
# Every column is then blanked to NA on rows where `response` is NA, and only
# id, text and volume are returned.
# NOTE(review): `response` presumably comes from df1 only, so NA there marks
# rows without a df1 match -- confirm against the inputs.
power_full_join_func_rowwise <- function(df1, df2) {
  merged <- power_full_join(
    df1, df2,
    by = c("id"),
    conflict = rw ~ ifelse(.x != .y, NA_integer_, .x)
  )
  blanked <- mutate(
    merged,
    across(everything(), ~ ifelse(is.na(response), NA, .x))
  )
  select(blanked, id, text, volume)
}
# Same pipeline as the row-wise variant, but the conflict function is given
# as a plain formula (no `rw` prefix), so it is applied to whole columns at
# once: differing values become NA, matching values keep the left side.
# Every column is then blanked to NA on rows where `response` is NA, and only
# id, text and volume are returned.
power_full_join_func_not_rowwise <- function(df1, df2) {
  merged <- power_full_join(
    df1, df2,
    by = c("id"),
    conflict = ~ ifelse(.x != .y, NA_integer_, .x)
  )
  blanked <- mutate(
    merged,
    across(everything(), ~ ifelse(is.na(response), NA, .x))
  )
  select(blanked, id, text, volume)
}
library(microbenchmark)
library(purrr)
library(ggplot2)
# make larger dfs (copy df1 and df2 X100)
df3 <- map_dfr(seq_len(100), ~ df1)
df4 <- map_dfr(seq_len(100), ~ df2)
# benchmark performance on the larger dataframes
res <- microbenchmark(power_full_join_func_rowwise(df3, df4),
power_full_join_func_not_rowwise(df3, df4))
res
#> Unit: milliseconds
#> expr min lq mean
#> power_full_join_func_rowwise(df3, df4) 397.32661 426.08117 449.88787
#> power_full_join_func_not_rowwise(df3, df4) 71.85757 77.25344 90.36191
#> median uq max neval cld
#> 446.41715 472.47817 587.3301 100 b
#> 81.18239 93.95103 191.1248 100 a
autoplot(res)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.
# Is the result the same?
all_equal(power_full_join_func_rowwise(df3, df4),
power_full_join_func_not_rowwise(df3, df4))
#> [1] TRUE
Created on 2022-11-24 by the reprex package (v2.0.1)
data.table version using an !antijoin, and overwriting := all columns/rows returned in df2 with an NA (recycled list .(NA) to all columns).
Then looping over all the common variables and overwriting any values which don't match by id:
library(data.table)
# Convert both data frames to data.tables by reference so `:=` can be used.
setDT(df1)
setDT(df2)
# Anti-join: for rows of df2 whose id is not found in df1, overwrite every
# column with NA (the single NA in .(NA) is recycled across all columns).
df2[!df1, on=.(id), names(df2) := .(NA)]
# Key column(s) used for matching.
idvars <- "id"
# Columns present in both tables, other than the key; these get compared.
compvars <- setdiff(intersect(names(df1), names(df2)), idvars)
# For each shared column, anti-join on id plus that column: rows of df2 with
# no df1 row matching on both get NA in that column only.
for (i in compvars) {
df2[!df1, on=c(idvars,i), (i) := NA]
}
# id text volume
# 1: 632592651 <NA> 1234
# 2: 633322173 cat 432
# 3: 634703802 dog 324
# 4: 634927873 mouse 333
# 5: 635812953 elephant 2223
# 6: 636004739 goose 412346
# 7: 636101211 rat 7456
# 8: 636157799 mice 3456
# 9: 636263106 kitty 2345
#10: 636752420 kitten 2345
#11: <NA> <NA> <NA>
#12: <NA> <NA> <NA>
#13: <NA> <NA> <NA>
#14: <NA> <NA> <NA>
#15: <NA> <NA> <NA>
#16: <NA> <NA> <NA>
#17: <NA> <NA> <NA>
#18: <NA> <NA> <NA>
#19: <NA> <NA> <NA>
#20: <NA> <NA> <NA>
I want to create a column - C - in dfABy that holds the name of whichever existing column (A or B) has a non-NA value in that row. For example, my df is:
>dfABy
A B
56 NA
NA 45
NA 77
67 NA
NA 65
The result I expect is:
> dfABy
A B C
56 NA A
NA 45 B
NA 77 B
67 NA A
NA 65 B
One option using dplyr could be:
df %>%
rowwise() %>%
mutate(C = names(.[!is.na(c_across(everything()))]))
A B C
<int> <int> <chr>
1 56 NA A
2 NA 45 B
3 NA 77 B
4 67 NA A
5 NA 65 B
Or with the addition of purrr:
df %>%
mutate(C = pmap_chr(across(A:B), ~ names(c(...)[!is.na(c(...))])))
You can use max.col over is.na values to get the column numbers where non-NA value is present. From those numbers you can get the column names.
dfABy$C <- names(dfABy)[max.col(!is.na(dfABy))]
dfABy
# A B C
#1 56 NA A
#2 NA 45 B
#3 NA 77 B
#4 67 NA A
#5 NA 65 B
If there is more than one non-NA value in a row, take a look at the ties.method argument in ?max.col to see how ties are handled.
data
dfABy <- structure(list(A = c(56L, NA, NA, 67L, NA), B = c(NA, 45L, 77L,
NA, 65L)), class = "data.frame", row.names = c(NA, -5L))
Using the data.table package I recommend:
dfABy[, C := apply(cbind(dfABy), 1, function(x) names(x[!is.na(x)]))]
creating the following output:
A B C
1 56 NA A
2 NA 45 B
3 NA 77 B
4 67 NA A
5 NA 65 B
This is just another solution; however, the other proposed solutions are better.
library(dplyr)
library(purrr)
df %>%
rowwise() %>%
mutate(C = detect_index(c(A, B), ~ !is.na(.x)),
C = names(.[C]))
# A tibble: 5 x 3
# Rowwise:
A B C
<dbl> <dbl> <chr>
1 56 NA A
2 NA 45 B
3 NA 77 B
4 67 NA A
5 NA 65 B
I have two data frames to compare. My first data frame is "sum"
> head(sum)
File_pdb Res1 Chain1 Res2 Chain2
1: 7LD1_CM GLN 81 M ASN 501 C
2: 7LD1_CM TYR 128 M PHE 377 C
3: 7LD1_CM ILE 78 M SER 375 C
4: 7LD1_CM ASN 76 M ALA 372 C
5: 7LD1_CM THR 20 M TYR 369 C
6: 7LD1_CM ARG 408 C LEU 131 M
The second one is "mut"
> head(mut)
RefAA MutAA LineagesCount
1 VAL 3 GLY 3 1
2 LEU 5 PHE 5 2
3 LEU 8 VAL 8 1
4 SER 13 ILE 13 2
5 LEU 18 PHE 18 5
6 THR 20 ILE 20 1
I have to check whether sum$Res1 or sum$Res2 contains values equal to mut$RefAA. If so, I need to add the whole matching row of mut next to sum$Res1 or sum$Res2.
here an example:
File_pdb Res1 Chain1 Res2 Chain2 RefAA MutAA LineagesCount
1: 7LD1_CM GLN 81 M ASN 501 C
2: 7LD1_CM TYR 128 M PHE 377 C
3: 7LD1_CM ILE 78 M SER 375 C
4: 7LD1_CM ASN 76 M ALA 372 C
5: 7LD1_CM THR 20 M TYR 369 C THR 20 ILE 20 1
6: 7LD1_CM ARG 408 C LEU 131 M
How can I do this? I tried using the merge and join functions, but I'm not very experienced and need more practice. Can someone help me? Thank you!
I had to fix the data a bit so that it could be imported easily. Then you can try a tidyverse approach:
library(tidyverse)
SUM %>%
mutate(index = 1:n()) %>%
pivot_longer(c(Res1, Res2)) %>%
left_join(mutate(MUT, value=RefAA), by = "value") %>%
group_by(index) %>%
fill(MutAA, RefAA, LineagesCount, .direction = "downup") %>%
ungroup() %>%
pivot_wider(names_from = name, values_from = value, values_fn = toString) %>%
mutate(which_Res = ifelse(RefAA == Res1, "Res1", "Res2"))
# A tibble: 6 x 10
File_pdb Chain1 Chain2 index RefAA MutAA LineagesCount Res1 Res2 which_Res
<chr> <chr> <chr> <int> <chr> <chr> <int> <chr> <chr> <chr>
1 7LD1_CM M C 1 NA NA NA GLN81 ASN501 NA
2 7LD1_CM M C 2 NA NA NA TYR128 PHE377 NA
3 7LD1_CM M C 3 NA NA NA ILE78 SER375 NA
4 7LD1_CM M C 4 NA NA NA ASN76 ALA372 NA
5 7LD1_CM M C 5 THR20 ILE20 1 THR20 TYR369 Res1
6 7LD1_CM C M 6 NA NA NA ARG408 LEU131 NA
The data
SUM <- read.table(text = " File_pdb Res1 Chain1 Res2 Chain2
1: 7LD1_CM GLN81 M ASN501 C
2: 7LD1_CM TYR128 M PHE377 C
3: 7LD1_CM ILE78 M SER375 C
4: 7LD1_CM ASN76 M ALA372 C
5: 7LD1_CM THR20 M TYR369 C
6: 7LD1_CM ARG408 C LEU131 M")
SUM
MUT <- read.table(text = " RefAA MutAA LineagesCount
1 VAL3 GLY3 1
2 LEU5 PHE5 2
3 LEU8 VAL8 1
4 SER13 ILE13 2
5 LEU18 PHE18 5
6 THR20 ILE20 1")
Hope this would help
do.call(
dplyr::coalesce,
lapply(
c("Res1", "Res2"),
function(x) merge(SUM, MUT, by.x = x, by.y = "RefAA", all.x = TRUE)
)
)
which gives
Res1 File_pdb Chain1 Res2 Chain2 MutAA LineagesCount
1 ARG408 7LD1_CM C LEU131 M <NA> NA
2 ASN76 7LD1_CM M ALA372 C <NA> NA
3 GLN81 7LD1_CM M ASN501 C <NA> NA
4 ILE78 7LD1_CM M SER375 C <NA> NA
5 THR20 7LD1_CM M TYR369 C ILE20 1
6 TYR128 7LD1_CM M PHE377 C <NA> NA
Data
> dput(SUM)
structure(list(File_pdb = c("7LD1_CM", "7LD1_CM", "7LD1_CM",
"7LD1_CM", "7LD1_CM", "7LD1_CM"), Res1 = c("GLN81", "TYR128",
"ILE78", "ASN76", "THR20", "ARG408"), Chain1 = c("M", "M", "M",
"M", "M", "C"), Res2 = c("ASN501", "PHE377", "SER375", "ALA372",
"TYR369", "LEU131"), Chain2 = c("C", "C", "C", "C", "C", "M")), class = "data.frame", row.names = c("1:",
"2:", "3:", "4:", "5:", "6:"))
> dput(MUT)
structure(list(RefAA = c("VAL3", "LEU5", "LEU8", "SER13", "LEU18",
"THR20"), MutAA = c("GLY3", "PHE5", "VAL8", "ILE13", "PHE18",
"ILE20"), LineagesCount = c(1L, 2L, 1L, 2L, 5L, 1L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6"))
I have a nested dataframe that I'm trying to unnest. Here's a fake example of the structure.
df <- structure(list(`_id` = c("a", "b", "c", "d"),
variable = list(structure(list(type = c("u", "a", "u", "a", "u", "a", "a"),
m_ = c("m1",
"m2",
"m3",
"m4",
"m5",
"m6", "m7"), #omitted from original example by mistake
t_ = c("2015-07-21 4:13 PM",
"2016-04-21 7:25 PM",
"2017-10-04 9:49 PM",
"2018-12-04 12:29 PM",
"2019-04-20 20:20 AM",
"2016-05-20 12:00 AM",
"2016-06-20 12:00 AM"),
a_ = c(NA,
"",
NA,
"",
NA,
"C",
"C")),
class = "data.frame",
row.names = c(NA, 7L)),
structure(list(type = c("u", "a"),
m_ = c("m1",
"m2"),
t_ = c("2018-05-24 12:08 AM",
"2019-04-24 3:05 PM"),
a_ = c(NA, "")),
class = "data.frame",
row.names = 1:2),
structure(list(type = "u",
m_ = "m1",
t_ = "2018-02-17 3:14 PM"),
class = "data.frame",
row.names = 1L),
structure(list(type = "u",
m_ = "m1",
t_ = "2016-05-27 5:14 PM",
b_ = "b1",
i_ = "i1",
e_ = structure(list(),
.Names = character(0),
class = "data.frame",
row.names = c(NA, -1L)),
l_ = "l1"),
class = "data.frame",
row.names = 1L)),
myDate = structure(c(1521503311.992,
1521514011.161,
1551699584.65,
1553632693.94),
class = c("POSIXct", "POSIXt"))),
row.names = c(1L, 2L, 3L, 4L),
class = "data.frame")
View(df)
variable is a list of dataframes that vary in length (max fields is 7 in this example, but can expand over time).
I tried using the development version of tidyr to take advantage of the new unnest_auto() function.
# devtools::install_github("tidyverse/tidyr")
df2 <- unnest_auto(df, variable)
View(df2)
If I use unnest_longer on the result and specify one column like type I get it to expand.
df3 <- unnest_longer(df2, type)
I don't see any arguments to unnest_longer() that handle multiple columns. Is there a better approach?
Here, since you're unnesting a two dimensional structure (i.e. you want to change both the rows and columns), you can just use unnest:
library(tidyr)
df <- as_tibble(df)
df
#> # A tibble: 4 × 3
#> `_id` variable myDate
#> <chr> <list> <dttm>
#> 1 a <df [7 × 4]> 2018-03-19 18:48:31
#> 2 b <df [2 × 4]> 2018-03-19 21:46:51
#> 3 c <df [1 × 3]> 2019-03-04 05:39:44
#> 4 d <df [1 × 7]> 2019-03-26 15:38:13
df |>
unnest(variable)
#> # A tibble: 11 × 10
#> `_id` type m_ t_ a_ b_ i_ e_ l_ myDate
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <df[> <chr> <dttm>
#> 1 a u m1 2015-07-… <NA> <NA> <NA> <NA> 2018-03-19 18:48:31
#> 2 a a m2 2016-04-… "" <NA> <NA> <NA> 2018-03-19 18:48:31
#> 3 a u m3 2017-10-… <NA> <NA> <NA> <NA> 2018-03-19 18:48:31
#> 4 a a m4 2018-12-… "" <NA> <NA> <NA> 2018-03-19 18:48:31
#> 5 a u m5 2019-04-… <NA> <NA> <NA> <NA> 2018-03-19 18:48:31
#> 6 a a m6 2016-05-… "C" <NA> <NA> <NA> 2018-03-19 18:48:31
#> 7 a a m7 2016-06-… "C" <NA> <NA> <NA> 2018-03-19 18:48:31
#> 8 b u m1 2018-05-… <NA> <NA> <NA> <NA> 2018-03-19 21:46:51
#> 9 b a m2 2019-04-… "" <NA> <NA> <NA> 2018-03-19 21:46:51
#> 10 c u m1 2018-02-… <NA> <NA> <NA> <NA> 2019-03-04 05:39:44
#> 11 d u m1 2016-05-… <NA> b1 i1 l1 2019-03-26 15:38:13
If you did want to do it in two steps, you could take advantage of the fact that unnest_longer() now takes a tidyselect specification:
df |>
unnest_wider(variable) |>
unnest_longer(type:a_)
#> # A tibble: 11 × 10
#> `_id` type m_ t_ a_ b_ i_ e_ l_ myDate
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <df[> <chr> <dttm>
#> 1 a u m1 2015-07-… <NA> <NA> <NA> <NA> 2018-03-19 18:48:31
#> 2 a a m2 2016-04-… "" <NA> <NA> <NA> 2018-03-19 18:48:31
#> 3 a u m3 2017-10-… <NA> <NA> <NA> <NA> 2018-03-19 18:48:31
#> 4 a a m4 2018-12-… "" <NA> <NA> <NA> 2018-03-19 18:48:31
#> 5 a u m5 2019-04-… <NA> <NA> <NA> <NA> 2018-03-19 18:48:31
#> 6 a a m6 2016-05-… "C" <NA> <NA> <NA> 2018-03-19 18:48:31
#> 7 a a m7 2016-06-… "C" <NA> <NA> <NA> 2018-03-19 18:48:31
#> 8 b u m1 2018-05-… <NA> <NA> <NA> <NA> 2018-03-19 21:46:51
#> 9 b a m2 2019-04-… "" <NA> <NA> <NA> 2018-03-19 21:46:51
#> 10 c u m1 2018-02-… <NA> <NA> <NA> <NA> 2019-03-04 05:39:44
#> 11 d u m1 2016-05-… <NA> b1 i1 l1 2019-03-26 15:38:13
This appears to work:
df %>% unnest_auto(variable) %>% unnest()
#Warning message:
#`cols` is now required.
#Please use `cols = c(type, m_, t_, a_, e_)`
df %>% unnest_auto(variable) %>% unnest(cols = c(type, m_, t_, a_, e_, l_))
# A tibble: 11 x 10
`_id` type m_ t_ a_ b_ i_ e_ l_ myDate
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <???> <chr> <dttm>
1 a u m1 2015-… NA NA NA NA NA 2018-03-20 02:48:31
2 a a m2 2016-… "" NA NA NA NA 2018-03-20 02:48:31
3 a u m3 2017-… NA NA NA NA NA 2018-03-20 02:48:31
4 a a m4 2018-… "" NA NA NA NA 2018-03-20 02:48:31
5 a u m5 2019-… NA NA NA NA NA 2018-03-20 02:48:31
6 a a m6 2016-… C NA NA NA NA 2018-03-20 02:48:31
7 a a m7 2016-… C NA NA NA NA 2018-03-20 02:48:31
8 b u m1 2018-… NA NA NA NA NA 2018-03-20 05:46:51
9 b a m2 2019-… "" NA NA NA NA 2018-03-20 05:46:51
10 c u m1 2018-… NA NA NA NA NA 2019-03-04 14:39:44
11 d u m1 2016-… NA b1 i1 NA l1 2019-03-26 23:38:13
I need to create a complicated "for" loop, but after reading some examples I'm still clueless of how to write it in a proper R way and therefore I'm not sure whether it will work or not. I'm still an R beginner :(
I have a dataset in the long format, with different occasions, however, some occasions are not truly new ones since the date of start is the same, but have a different offence that I need to copy in a new column called "offence2", after this I need to drop the false new occasion, in order to keep only rows that represent new occasions. My real data have up to 8 different offences for a single date, but I made a simpler example.
This is an example of what my data look like
id<-c(1,1,1,2,2,3,3,3,4,4,4,4,5,5,5)
dstart<-c("25/11/2006", "13/12/2006","13/12/2006","07/02/2006","07/02/2006",
"15/01/2006", "22/03/2006","18/09/2006", "04/03/2006","04/03/2006",
"22/08/2006","22/08/2006","11/04/2006", "11/04/2006", "19/10/2006")
dstart1<-as.Date(dstart, "%d/%m/%Y")
offence<-c("a","b","c","b","d","a","a","e","b","a","c","a","a","b","a")
cod_offence<-c(25, 26,27,26,28,25,25,29,26,25,27,25,25,26,25)
mydata<-data.frame(id, dstart1, offence, cod_offence)
Data
id dstart1 offence cod_offence
1 1 2006-11-25 a 25
2 1 2006-12-13 b 26
3 1 2006-12-13 c 27
4 2 2006-02-07 b 26
5 2 2006-02-07 d 28
6 3 2006-01-15 a 25
7 3 2006-03-22 a 25
8 3 2006-09-18 e 29
9 4 2006-03-04 b 26
10 4 2006-03-04 a 25
11 4 2006-08-22 c 27
12 4 2006-08-22 a 25
13 5 2006-04-11 a 25
14 5 2006-04-11 b 26
15 5 2006-10-19 a 25
I need something like this:
id dstart1 offence cod_offence offence2
1 1 2006-11-25 a 25 NA
2 1 2006-12-13 b 26 c
3 1 2006-12-13 c 27 NA
4 2 2006-02-07 b 26 d
5 2 2006-02-07 d 28 NA
6 3 2006-01-15 a 25 NA
7 3 2006-03-22 a 25 NA
8 3 2006-09-18 e 29 NA
9 4 2006-03-04 b 26 a
10 4 2006-03-04 a 25 NA
11 4 2006-08-22 c 27 a
12 4 2006-08-22 a 25 NA
13 5 2006-04-11 a 25 b
14 5 2006-04-11 b 26 NA
15 5 2006-10-19 a 25 NA
I think that I need to do something like this:
given i=individual
j=observation within individual
for each individual I need to check whether mydata$dstart1(j) = mydata$dstart1(j+1)
if this is true, then copy mydata$offence2(j)=mydata$offence(j+1), otherwise keep the same value
This has to stop if id(j) != id(j+1) and re-start with the new id.
My problem is that I don't know how to put this in a loop.
Thank you!!
Update
Yes, it works fine with the example, but not yet with my real data, since they are a little bit more complex
What happen if instead of two repeated dates I have three or more? each one of them with different offences. Following #CathG solution, I need to create more variables according to the number of offences (in my case 8), I guess I would need a new vector that identify the position of the observation within id and a new "instruction" that tell R that depending of the position of the mydata$dstart1, the value need to be copied in a different column. But then again, I don't know how to do it.
id dstart1 offence cod_offence offence2 offence3 offence4
1 1 2006-11-25 a 25 NA NA NA
2 1 2006-12-13 b 26 c NA NA
3 1 2006-12-13 c 27 NA NA NA
4 2 2006-02-07 b 26 d NA NA
5 2 2006-02-07 d 28 NA NA NA
6 2 2006-04-12 b 26 d c a
7 2 2006-04-12 d 28 NA NA NA
8 2 2006-04-12 c 27 NA NA NA
9 2 2006-04-12 a 25 NA NA NA
Thanks again!!!
With split and a loop:
# data with repeated dates / offences
id <- c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5)
dstart <- c(
  "25/11/2006", "13/12/2006", "13/12/2006", "07/02/2006", "07/02/2006",
  "15/01/2006", "22/03/2006", "18/09/2006", "04/03/2006", "04/03/2006",
  "22/08/2006", "22/08/2006", "11/04/2006", "11/04/2006", "19/10/2006",
  "19/10/2006", "19/10/2006", "19/10/2006"
)
dstart1 <- as.Date(dstart, "%d/%m/%Y")
offence <- c(
  "a", "b", "c", "b", "d", "a", "a", "e", "b", "a", "c", "a", "a", "b",
  "a", "c", "b", "a"
)
cod_offence <- c(
  25, 26, 27, 26, 28, 25, 25, 29, 26, 25, 27, 25, 25, 26, 25, 27, 25, 25
)
mydata <- data.frame(id, dstart1, offence, cod_offence)

# largest number of offences recorded for a single id on a single date
maxoff <- max(table(mydata$id, mydata$dstart1))

# add the extra columns offence2 ... offence<maxoff>, initialised to a
# character NA so they have the same type whether or not they get filled.
# Guarded because 2:maxoff counts DOWN (2, 1) when maxoff is 1, which would
# create spurious "offence2"/"offence1" columns.
if (maxoff > 1L) {
  mydata[, paste0("offence", 2:maxoff)] <- NA_character_
}

# split the data according to id
splitmydata <- split(mydata, mydata$id)

# for each per-id data set, look for dates that occur more than once and copy
# the offences of the later rows into offence2, offence3, ... of the row
# holding the first occurrence of that date
splitmydata2 <- lapply(splitmydata, function(tab) {
  # "offence", "offence2", ... in column order (cod_offence does not match)
  offence_cols <- grep("^offence", colnames(tab), value = TRUE)
  for (datestart in unique(tab[, "dstart1"])) {
    # which() already returns the row positions in increasing order
    ind_date <- which(tab[, "dstart1"] == datestart)
    if (length(ind_date) > 1L) {
      # first row of the date receives the offences of rows 2..k
      tab[ind_date[1], offence_cols[2:length(ind_date)]] <-
        as.character(tab[ind_date[-1], "offence"])
    }
  }
  tab
})

mydata2 <- unsplit(splitmydata2, mydata$id) # finally, unsplit the data
> mydata2
id dstart1 offence cod_offence offence2 offence3 offence4
1 1 2006-11-25 a 25 <NA> <NA> <NA>
2 1 2006-12-13 b 26 c <NA> <NA>
3 1 2006-12-13 c 27 <NA> <NA> <NA>
4 2 2006-02-07 b 26 d <NA> <NA>
5 2 2006-02-07 d 28 <NA> <NA> <NA>
6 3 2006-01-15 a 25 <NA> <NA> <NA>
7 3 2006-03-22 a 25 <NA> <NA> <NA>
8 3 2006-09-18 e 29 <NA> <NA> <NA>
9 4 2006-03-04 b 26 a <NA> <NA>
10 4 2006-03-04 a 25 <NA> <NA> <NA>
11 4 2006-08-22 c 27 a <NA> <NA>
12 4 2006-08-22 a 25 <NA> <NA> <NA>
13 5 2006-04-11 a 25 b <NA> <NA>
14 5 2006-04-11 b 26 <NA> <NA> <NA>
15 5 2006-10-19 a 25 c b a
16 5 2006-10-19 c 27 <NA> <NA> <NA>
17 5 2006-10-19 b 25 <NA> <NA> <NA>
18 5 2006-10-19 a 25 <NA> <NA> <NA>
You can use base R
# Within each id, mark (1/0) the rows whose start date equals the start date
# of the following row; the last row of each id is always marked 0.
indx <- with(
  mydata,
  ave(
    as.numeric(dstart1), id,
    FUN = function(d) c(d[-1] == d[-length(d)], FALSE)
  )
)
# For marked rows, offence2 takes the offence of the next row; otherwise NA.
transform(
  mydata,
  offence2 = ifelse(as.logical(indx), c(as.character(offence)[-1], NA), NA)
)
Or using dplyr
library(dplyr)
mydata %>%
group_by(id) %>%
mutate(offence2= dstart1==lead(dstart1),
offence2= ifelse(!is.na(offence2)&offence2,
as.character(lead(offence)), NA_character_))
# id dstart1 offence cod_offence offence2
#1 1 2006-11-25 a 25 NA
#2 1 2006-12-13 b 26 c
#3 1 2006-12-13 c 27 NA
#4 2 2006-02-07 b 26 d
#5 2 2006-02-07 d 28 NA
#6 3 2006-01-15 a 25 NA
#7 3 2006-03-22 a 25 NA
#8 3 2006-09-18 e 29 NA
#9 4 2006-03-04 b 26 a
#10 4 2006-03-04 a 25 NA
#11 4 2006-08-22 c 27 a
#12 4 2006-08-22 a 25 NA
#13 5 2006-04-11 a 25 b
#14 5 2006-04-11 b 26 NA
#15 5 2006-10-19 a 25 NA
or using data.table
library(data.table)
# Per id: flag rows whose next row has the same start date (the last row of
# each id is never flagged), copy the next row's offence into offence2 for
# flagged rows, then drop the helper column.
# The `yes` argument of ifelse() must be as long as the group: a shorter
# vector such as offence[which(indx) + 1] is recycled and can hand the wrong
# offence to a flagged row when an id has more than one flagged row.
setDT(mydata)[, indx := c(dstart1[-1] == dstart1[-.N], FALSE), by = id][
  , offence2 := ifelse(
    indx,
    c(as.character(offence)[-1], NA_character_), # offence of the next row
    NA_character_
  ),
  by = id
][, indx := NULL]
mydata
# id dstart1 offence cod_offence offence2
#1: 1 2006-11-25 a 25 NA
#2: 1 2006-12-13 b 26 c
#3: 1 2006-12-13 c 27 NA
#4: 2 2006-02-07 b 26 d
#5: 2 2006-02-07 d 28 NA
#6: 3 2006-01-15 a 25 NA
#7: 3 2006-03-22 a 25 NA
#8: 3 2006-09-18 e 29 NA
#9: 4 2006-03-04 b 26 a
#10: 4 2006-03-04 a 25 NA
#11: 4 2006-08-22 c 27 a
#12: 4 2006-08-22 a 25 NA
#13: 5 2006-04-11 a 25 b
#14: 5 2006-04-11 b 26 NA
#15: 5 2006-10-19 a 25 NA
Update
Using the new dataset mydata2 and if you use the first method, we get d1
indx <- with(mydata2, ave(as.numeric(dstart1), id,
FUN=function(x) c(x[-1]==x[-length(x)], FALSE)))
d1 <- transform(mydata2, offence2=ifelse(!!indx,
c(as.character(offence[-1]), NA), NA))
From d1, we can create an indx column and then use dcast to convert the offence2 column from long to wide form. If there are columns with all NAs, we can remove them by using colSums(is.na(...)). Rename the columns, then use mutate_each from dplyr to sort the columns, and finally cbind the result with mydata2
# Position of each row within its (id, dstart1) group: 1 for the first row
# recorded on that date, 2 for the second, and so on.
d1$indx <- with(d1, ave(seq_along(id), id, dstart1, FUN = seq_along))
library(reshape2)
# Spread offence2 into one column per within-date position.
d2 <- dcast(d1, id + dstart1 + indx ~ indx, value.var = "offence2")
# Drop the columns that contain nothing but NA.
d2New <- d2[, colSums(is.na(d2)) != nrow(d2)]
# The remaining digit-named columns become offence2, offence3, ...
nm1 <- grep("^\\d", colnames(d2New))
colnames(d2New)[nm1] <- paste0("offence", 2:(length(nm1) + 1))
# Drop the indx column (3rd) and, within each id/date, sort every offence
# column so the non-NA values move to the first row of the group.
# across() replaces the deprecated mutate_each(funs(...)) named in the text.
d3 <- d2New[, -3] %>%
  group_by(id, dstart1) %>%
  mutate(across(everything(), ~ .x[order(.x)])) %>%
  ungroup()
# Bind to mydata2 (the data d1/d3 were built from); binding to mydata fails
# because the row counts differ.
cbind(mydata2, d3[, -c(1:2)])
# id dstart1 offence cod_offence offence2 offence3 offence4
#1 1 2006-11-25 a 25 <NA> <NA> <NA>
#2 1 2006-12-13 b 26 c <NA> <NA>
#3 1 2006-12-13 c 27 <NA> <NA> <NA>
#4 2 2006-02-07 b 26 d <NA> <NA>
#5 2 2006-02-07 d 28 <NA> <NA> <NA>
#6 2 2006-04-12 b 26 d c a
#7 2 2006-04-12 d 28 <NA> <NA> <NA>
#8 2 2006-04-12 c 27 <NA> <NA> <NA>
#9 2 2006-04-12 a 25 <NA> <NA> <NA>
data
mydata <- structure(list(id = c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5,
5, 5), dstart1 = structure(c(13477, 13495, 13495, 13186, 13186,
13163, 13229, 13409, 13211, 13211, 13382, 13382, 13249, 13249,
13440), class = "Date"), offence = structure(c(1L, 2L, 3L, 2L,
4L, 1L, 1L, 5L, 2L, 1L, 3L, 1L, 1L, 2L, 1L), .Label = c("a",
"b", "c", "d", "e"), class = "factor"), cod_offence = c(25, 26,
27, 26, 28, 25, 25, 29, 26, 25, 27, 25, 25, 26, 25)), .Names = c("id",
"dstart1", "offence", "cod_offence"), row.names = c(NA, -15L),
class = "data.frame")
mydata2 <- structure(list(id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L),
dstart1 = structure(c(13477, 13495, 13495, 13186, 13186, 13250, 13250,
13250, 13250), class = "Date"), offence = c("a", "b", "c", "b", "d", "b",
"d", "c", "a"), cod_offence = c(25L, 26L, 27L, 26L, 28L, 26L, 28L, 27L, 25L
)), .Names = c("id", "dstart1", "offence", "cod_offence"), row.names =
c("1","2", "3", "4", "5", "6", "7", "8", "9"), class = "data.frame")