Within-cell operations in matching R data frames - r

I have two matching data frames in R, one has strings separated by underscores and the other has matching counts for the strings also separated by underscores. Here are some excerpts:
strings_df:
SampleName V1 V2
asd A/I/R S/G
asd A/I/R NA
afsdf A/I/R_A/I/R_V/I/R S/G_A/A_C/A_F/A
dsg A/I/R S/G
eee A/I/R S/G
shg NA NA
gfdf A/I/R_A/I/R_A/I/R NA
counts_df:
SampleName V1 V2
asd 53 35
asd 66 NA
afsdf 78_80_100 66_55_30
dsg 99 65
eee 64 43
shg NA NA
gfdf 46/47/88 NA
Whenever there are two identical strings in strings_df (such as A/I/R in A/I/R_A/I/R_V/I/R) I need to sum the corresponding counts in counts_df and only leave a single instance of the string in strings_df, otherwise data should be left as is.
The output data frames for this example would be:
resulting_strings_df:
SampleName V1 V2
asd A/I/R S/G
asd A/I/R NA
afsdf A/I/R_V/I/R S/G_A/A_C/A_F/A
dsg A/I/R S/G
eee A/I/R S/G
shg NA NA
gfdf A/I/R NA
resulting_counts_df:
SampleName V1 V2
asd 53 35
asd 66 NA
afsdf 158_100 66_55_30
dsg 99 65
eee 64 43
shg NA NA
gfdf 181 NA
I've tried separating the columns with separate() using the underscore as separator but because the number of underscores on cells is uneven, it ends up being a total mess. I'm stuck at that point at the moment.

An approach with dplyr, assuming rows between the datasets correspond.
First combine the data after renaming the columns, then detect duplicates and sum/paste respectively.
library(dplyr)
df_new <- tibble(strings_df %>% rename_with(function(x) paste0(x, ".X")),
counts_df %>% rename_with(function(x) paste0(x, ".Y"))) %>%
mutate(spli = Vectorize(\(x) as.numeric(x))(strsplit(V1.Y, "_|/")),
spli_chr = ifelse(lengths(spli) > 1, strsplit(V1.X, "_"), V1.X)) %>%
rowwise() %>%
mutate(is = list(duplicated(spli_chr) | duplicated(spli_chr, fromLast=T)),
spli = paste0(sum(spli[is]), "_", spli[!is]),
V1.X = paste(unique(spli_chr), collapse="_"),
V1.Y = gsub("^_|0_|_0|_$", "", spli)) %>%
select(-c(spli, spli_chr, is)) %>%
ungroup()
Get the single data sets after regaining the original colnames.
df_new %>%
select(ends_with(".X")) %>%
rename_with(function(x) colnames(strings_df))
# A tibble: 7 × 3
SampleName V1 V2
<chr> <chr> <chr>
1 asd A/I/R S/G
2 asd A/I/R NA
3 afsdf A/I/R_V/I/R S/G_A/A_C/A_F/A
4 dsg A/I/R S/G
5 eee A/I/R S/G
6 shg NA NA
7 gfdf A/I/R NA
df_new %>%
select(ends_with(".Y")) %>%
rename_with(function(x) colnames(counts_df))
# A tibble: 7 × 3
SampleName V1 V2
<chr> <chr> <chr>
1 asd 53 35
2 asd 66 NA
3 afsdf 158_100 66_55_30
4 dsg 99 65
5 eee 64 43
6 shg NA NA
7 gfdf 181 NA

I would suggest combining the two datasets to make sure that they're aligned before splitting them into the desired frames.
In one go using group_split:
library(tidyr)
library(dplyr)
strings_df |>
pivot_longer(-SampleName, values_to = "string") |>
left_join(counts_df |> pivot_longer(-SampleName, values_to = "count")) |>
separate_longer_delim(c(string, count), delim = "_") |>
reframe(count = sum(as.numeric(count)), .by = c(SampleName, name, string)) |>
reframe(across(c(string, count), ~ paste(., collapse = "_")), .by = c(SampleName, name)) |>
pivot_longer(c(string, count), names_to = "split") |>
pivot_wider(id_cols = c(SampleName, split)) |>
group_split(split)
Output:
# A tibble: 7 × 4
SampleName split V1 V2
<chr> <chr> <chr> <chr>
1 asd1 count 53 35
2 asd2 count 66 NA
3 afsdf count 158_100 66_55_30
4 dsg count 99 65
5 eee count 64 43
6 shg count NA NA
7 gfdf count 181 NA
[[2]]
# A tibble: 7 × 4
SampleName split V1 V2
<chr> <chr> <chr> <chr>
1 asd1 string A/I/R S/G
2 asd2 string A/I/R NA
3 afsdf string A/I/R_V/I/R S/G_A/A_C/A
4 dsg string A/I/R S/G
5 eee string A/I/R S/G
6 shg string NA NA
7 gfdf string A/I/R NA
... Or you could do it a bit more manually ...
Step 1: The combined data frame:
library(tidyr)
library(dplyr)
combined_df <-
strings_df |>
pivot_longer(-SampleName, values_to = "string") |>
left_join(counts_df |> pivot_longer(-SampleName, values_to = "count")) |>
separate_longer_delim(c(string, count), delim = "_") |>
reframe(count = sum(as.numeric(count)), .by = c(SampleName, name, string)) |>
reframe(across(c(string, count), ~ paste(., collapse = "_")), .by = c(SampleName, name))
Output:
SampleName name string count
<chr> <chr> <chr> <chr>
1 asd1 V1 A/I/R 53
2 asd1 V2 S/G 35
3 asd2 V1 A/I/R 66
4 asd2 V2 NA NA
5 afsdf V1 A/I/R_V/I/R 158_100
6 afsdf V2 S/G_A/A_C/A 66_55_30
7 dsg V1 A/I/R 99
8 dsg V2 S/G 65
9 eee V1 A/I/R 64
10 eee V2 S/G 43
11 shg V1 NA NA
12 shg V2 NA NA
13 gfdf V1 A/I/R 181
14 gfdf V2 NA NA
Step 2: Get the two data frames you want:
resulting_strings_df <-
combined_df |>
pivot_wider(id_cols = SampleName, values_from = "string")
resulting_counts_df <-
combined_df |>
pivot_wider(id_cols = SampleName, values_from = "count")
Output:
SampleName V1 V2
<chr> <chr> <chr>
1 asd1 A/I/R S/G
2 asd2 A/I/R NA
3 afsdf A/I/R_V/I/R S/G_A/A_C/A
4 dsg A/I/R S/G
5 eee A/I/R S/G
6 shg NA NA
7 gfdf A/I/R NA
SampleName V1 V2
<chr> <chr> <chr>
1 asd1 53 35
2 asd2 66 NA
3 afsdf 158_100 66_55_30
4 dsg 99 65
5 eee 64 43
6 shg NA NA
7 gfdf 181 NA
Assuming that:
samplenames are unique per now.
_ is the separater in counts_df
the number of underscores per SampleName doesn't vary across the two dataframes.
Adjusted data:
library(readr)
strings_df <- read_table("SampleName V1 V2
asd1 A/I/R S/G
asd2 A/I/R NA
afsdf A/I/R_A/I/R_V/I/R S/G_A/A_C/A
dsg A/I/R S/G
eee A/I/R S/G
shg NA NA
gfdf A/I/R_A/I/R_A/I/R NA")
counts_df <- read_table("SampleName V1 V2
asd1 53 35
asd2 66 NA
afsdf 78_80_100 66_55_30
dsg 99 65
eee 64 43
shg NA NA
gfdf 46_47_88 NA")

library(tidyverse)
pivot_longer(strings_df, V1:V2, values_to = 'string') %>%
left_join(pivot_longer(counts_df, V1:V2)) %>%
separate_rows(c(string, value),sep = '_', convert = TRUE) %>%
summarise(value=sum(value), .by=c(SampleName, name, string)) %>%
pivot_wider(id_cols = SampleName, values_from = c(string, value),
values_fn = ~str_c(.x, collapse = '_'))
#> # A tibble: 7 × 5
#> SampleName string_V1 string_V2 value_V1 value_V2
#> <chr> <chr> <chr> <chr> <chr>
#> 1 asd1 A/I/R S/G 53 35
#> 2 asd2 A/I/R <NA> 66 <NA>
#> 3 afsdf A/I/R_V/I/R S/G_A/A_C/A 158_100 66_55_30
#> 4 dsg A/I/R S/G 99 65
#> 5 eee A/I/R S/G 64 43
#> 6 shg <NA> <NA> <NA> <NA>
#> 7 gfdf A/I/R <NA> 181 <NA>
Created on 2023-02-19 with reprex v2.0.2

Related

Collapse data frame so NAs are removed

I want to collapse this data frame so NA's are removed. How to accomplish this? Thanks!!
id <- c(1,1,1,2,2,3,4,5,5)
q1 <- c(23,55,7,88,90,34,11,22,99)
df <- data.frame(id,q1)
df$row <- 1:nrow(df)
spread(df, id, q1)
row 1 2 3 4 5
1 23 NA NA NA NA
2 55 NA NA NA NA
3 7 NA NA NA NA
4 NA 88 NA NA NA
5 NA 90 NA NA NA
6 NA NA 34 NA NA
7 NA NA NA 11 NA
8 NA NA NA NA 22
9 NA NA NA NA 89
I want it to look like this:
1 2 3 4 5
23 88 34 11 22
55 90 NA NA 89
7 NA NA NA NA
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
The row should be created on the sequence of 'id'. In addition, pivot_wider would be a more general function compared to spread
library(dplyr)
library(tidyr)
df %>%
group_by(id) %>%
mutate(row = row_number()) %>%
ungroup %>%
pivot_wider(names_from = id, values_from = q1) %>%
select(-row)
-output
# A tibble: 3 × 5
`1` `2` `3` `4` `5`
<dbl> <dbl> <dbl> <dbl> <dbl>
1 23 88 34 11 22
2 55 90 NA NA 99
3 7 NA NA NA NA
Or use dcast
library(data.table)
dcast(setDT(df), rowid(id) ~ id, value.var = 'q1')[, id := NULL][]
1 2 3 4 5
<num> <num> <num> <num> <num>
1: 23 88 34 11 22
2: 55 90 NA NA 99
3: 7 NA NA NA NA
Here's a base R solution. I sort each column so the non-NA values are at the top, find the number of non-NA values in the column with the most non-NA values (n), and return the top n rows from the data frame.
library(tidyr)
id <- c(1,1,1,2,2,3,4,5,5)
q1 <- c(23,55,7,88,90,34,11,22,99)
df <- data.frame(id,q1)
df$row <- 1:nrow(df)
df <- spread(df, id, q1)
collapse_df <- function(df) {
move_na_to_bottom <- function(x) x[order(is.na(x))]
sorted <- sapply(df, move_na_to_bottom)
count_non_na <- function(x) sum(!is.na(x))
n <- max(apply(df, 2, count_non_na))
sorted[1:n, ]
}
collapse_df(df[, -1])

Converting long to wide when some observations have more than one time variable

I have a dataset where I just want each ID to have one row. Some of them have 2 of a different ID, and I want to make the dataset wide like in the sample output below.
ID. other_ID. height color
44 57 56 blue
32 99 28 green
66 23 19 yellow
66 56 5 purple
80 43 7 green
ID other_ID other_ID2 height1 height2 color1 color2
44 57 NA 56 NA blue NA
32 99 NA 28 NA green NA
66 23 56 19 5 yellow purple
80 43 NA 7 NA green NA
df <- data.frame(
"ID." = c(44, 32, 66, 66, 80),
"other_ID." = c(57, 99, 23, 56, 43),
"height" = c(56, 28, 19, 5, 7),
"color" = c("blue", "green", "yellow", "purple", "green")
)
Here’s an idea using tidyverse. The idea is to use the non-unique ID column to create a count of the number of observations within that ID, then use that count to convert the data to wide format.
library(tidyverse)
df_wide <-
df %>%
group_by(ID.) %>%
mutate(obs = row_number()) %>%
pivot_wider(id_cols = ID., values_from = c(other_ID., height, color), names_from = obs)
df_wide
#> # A tibble: 4 x 7
#> # Groups: ID. [4]
#> ID. other_ID._1 other_ID._2 height_1 height_2 color_1 color_2
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
#> 1 44 57 NA 56 NA blue <NA>
#> 2 32 99 NA 28 NA green <NA>
#> 3 66 23 56 19 5 yellow purple
#> 4 80 43 NA 7 NA green <NA>
You can use a simple pivot_wider:
df %>%
group_by(ID) %>%
mutate(count_id = 1:n()) %>%
ungroup() %>%
pivot_wider(values_from = c(other_ID, height, color), names_from = count_id)
which gives:
# A tibble: 4 x 7
ID other_ID_1 other_ID_2 height_1 height_2 color_1 color_2
<int> <int> <int> <int> <int> <chr> <chr>
1 44 57 NA 56 NA blue <NA>
2 32 99 NA 28 NA green <NA>
3 66 23 56 19 5 yellow purple
4 80 43 NA 7 NA green <NA>
in Base R:
reshape(transform(df, time = ave(ID, ID, FUN = seq_along)), dir = 'wide', idvar = 'ID', sep="")
ID other_ID1 height1 color1 other_ID2 height2 color2
1 44 57 56 blue NA NA <NA>
2 32 99 28 green NA NA <NA>
3 66 23 19 yellow 56 5 purple
5 80 43 7 green NA NA <NA>
in Data.table:
library(data.table)
dcast(setDT(df), ID~rowid(ID), value.var = names(df)[-1], sep = "")
ID other_ID1 other_ID2 height1 height2 color1 color2
1: 32 99 NA 28 NA green <NA>
2: 44 57 NA 56 NA blue <NA>
3: 66 23 56 19 5 yellow purple
4: 80 43 NA 7 NA green <NA>
A solution based on data.table:
library(data.table)
df <- data.frame(
stringsAsFactors = FALSE,
ID = c(44L, 32L, 66L, 66L, 80L),
other_ID = c(57L, 99L, 23L, 56L, 43L),
height = c(56L, 28L, 19L, 5L, 7L),
color = c("blue", "green", "yellow", "purple", "green")
)
dcast(setDT(df), ID ~ rowid(ID), value.var = names(df)[-1])
#> ID other_ID_1 other_ID_2 height_1 height_2 color_1 color_2
#> 1: 32 99 NA 28 NA green <NA>
#> 2: 44 57 NA 56 NA blue <NA>
#> 3: 66 23 56 19 5 yellow purple
#> 4: 80 43 NA 7 NA green <NA>
A solution based on nest and unnest_wider:
library(tidyverse)
df %>%
nest(data = c(other_ID, height, color)) %>%
unnest_wider(data) %>% unnest_wider(other_ID, names_sep = "") %>%
unnest_wider(height, names_sep = "") %>% unnest_wider(color, names_sep = "")
#> # A tibble: 4 × 7
#> ID other_ID1 other_ID2 height1 height2 color1 color2
#> <int> <int> <int> <int> <int> <chr> <chr>
#> 1 44 57 NA 56 NA blue <NA>
#> 2 32 99 NA 28 NA green <NA>
#> 3 66 23 56 19 5 yellow purple
#> 4 80 43 NA 7 NA green <NA>
Another possible solution:
library(tidyverse)
df %>%
group_by(ID) %>%
summarise(across(everything(), ~ str_c(., collapse = ",")), .groups ="drop") %>%
reduce(names(.)[-1], function(x,y)
separate(x,y, into=str_c(y,1:2), sep=",", fill="right", convert=T), .init=.)
#> # A tibble: 4 × 7
#> ID other_ID1 other_ID2 height1 height2 color1 color2
#> <int> <int> <int> <int> <int> <chr> <chr>
#> 1 32 99 NA 28 NA green <NA>
#> 2 44 57 NA 56 NA blue <NA>
#> 3 66 23 56 19 5 yellow purple
#> 4 80 43 NA 7 NA green <NA>

Merge/combine rows with same ID and Date in R

I have an excel database like below. The Excel database had option to enter only 3 drug details. Wherever there are more than 3 drugs, it has been entered into another row with PID and Date.
Is there a way I can merge the rows in R so that each patient's records will be in a single row? In the example below, I need to merge Row 1 & 2 and 4 & 6.
Thanks.
Row
PID
Date
Drug1
Dose1
Drug2
Dose2
Drug3
Dose3
Age
Place
1
11A
25/10/2021
RPG
12
NAT
34
QRT
5
45
PMk
2
11A
25/10/2021
BET
10
SET
43
BLT
45
3
12B
20/10/2021
ATY
13
LTP
3
CRT
3
56
GTL
4
13A
22/10/2021
GGS
7
GSF
12
ERE
45
45
RKS
5
13A
26/10/2021
BRT
9
ARR
4
GSF
34
46
GLO
6
13A
22/10/2021
DFS
5
7
14B
04/08/2021
GDS
2
TRE
55
HHS
34
25
MTK
Up front, the two methods below are completely different, not equivalents in "base R vs dplyr". I'm sure either can be translated to the other.
dplyr
The premise here is to first reshape/pivot the data longer so that each Drug/Dose is on its own line, renumber them appropriately, and then bring it back to a wide state.
NOTE: frankly, I usually prefer to deal with data in a long format, so consider keeping it in its state immediately before pivot_wider. This means you'd need to bring Age and Place back into it somehow.
Why? A long format deals very well with many types of aggregation; ggplot2 really really prefers data in the long format; I dislike seeing and having to deal with all of the NA/empty values that will invariably happen with this wide format, since many PIDs don't have (e.g.) Drug6 or later. This seems subjective, but it can really be an objective change/improvement to data-mangling, depending on your workflow.
library(dplyr)
# library(tidyr) # pivot_longer, pivot_wider
dat0 <- select(dat, PID, Date, Age, Place) %>%
group_by(PID, Date) %>%
summarize(across(everything(), ~ .[!is.na(.) & nzchar(trimws(.))][1] ))
dat %>%
select(-Age, -Place) %>%
tidyr::pivot_longer(
-c(Row, PID, Date),
names_to = c(".value", "iter"),
names_pattern = "^([^0-9]+)([123]?)$") %>%
arrange(Row, iter) %>%
group_by(PID, Date) %>%
mutate(iter = row_number()) %>%
select(-Row) %>%
tidyr::pivot_wider(
c("PID", "Date"), names_sep = "",
names_from = "iter", values_from = c("Drug", "Dose")) %>%
left_join(dat0, by = c("PID", "Date"))
# # A tibble: 5 x 16
# # Groups: PID, Date [5]
# PID Date Drug1 Drug2 Drug3 Drug4 Drug5 Drug6 Dose1 Dose2 Dose3 Dose4 Dose5 Dose6 Age Place
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <int> <int> <int> <int> <int> <int> <int> <chr>
# 1 11A 25/10/2021 RPG NAT QRT BET "SET" "BLT" 12 34 5 10 43 45 45 PMk
# 2 12B 20/10/2021 ATY LTP CRT <NA> <NA> <NA> 13 3 3 NA NA NA 56 GTL
# 3 13A 22/10/2021 GGS GSF ERE DFS "" "" 7 12 45 5 NA NA 45 RKS
# 4 13A 26/10/2021 BRT ARR GSF <NA> <NA> <NA> 9 4 34 NA NA NA 46 GLO
# 5 14B 04/08/2021 GDS TRE HHS <NA> <NA> <NA> 2 55 34 NA NA NA 25 MTK
Notes:
I broke out dat0 early, since Age and Place don't really fit into the pivot/renumber/pivot mindset.
base R
Here's a base R method that splits (according to your grouping criteria: PID and Date), finds the Drug/Dose columns that need to be renumbered, renames them, and the merges all of the frames back together.
spl <- split(dat, ave(rep(1L, nrow(dat)), dat[,c("PID", "Date")], FUN = seq_along))
spl
# $`1`
# Row PID Date Drug1 Dose1 Drug2 Dose2 Drug3 Dose3 Age Place
# 1 1 11A 25/10/2021 RPG 12 NAT 34 QRT 5 45 PMk
# 3 3 12B 20/10/2021 ATY 13 LTP 3 CRT 3 56 GTL
# 4 4 13A 22/10/2021 GGS 7 GSF 12 ERE 45 45 RKS
# 5 5 13A 26/10/2021 BRT 9 ARR 4 GSF 34 46 GLO
# 7 7 14B 04/08/2021 GDS 2 TRE 55 HHS 34 25 MTK
# $`2`
# Row PID Date Drug1 Dose1 Drug2 Dose2 Drug3 Dose3 Age Place
# 2 2 11A 25/10/2021 BET 10 SET 43 BLT 45 NA
# 6 6 13A 22/10/2021 DFS 5 NA NA NA
nms <- lapply(spl, function(x) grep("^(Drug|Dose)", colnames(x), value = TRUE))
nms <- data.frame(i = rep(names(nms), lengths(nms)), oldnm = unlist(nms))
nms$grp <- gsub("[0-9]+$", "", nms$oldnm)
nms$newnm <- paste0(nms$grp, ave(nms$grp, nms$grp, FUN = seq_along))
nms <- split(nms, nms$i)
newspl <- Map(function(x, nm) {
colnames(x)[ match(nm$oldnm, colnames(x)) ] <- nm$newnm
x
}, spl, nms)
newspl[-1] <- lapply(newspl[-1], function(x) x[, c("PID", "Date", grep("^(Drug|Dose)", colnames(x), value = TRUE)), drop = FALSE ])
newspl
# $`1`
# Row PID Date Drug1 Dose1 Drug2 Dose2 Drug3 Dose3 Age Place
# 1 1 11A 25/10/2021 RPG 12 NAT 34 QRT 5 45 PMk
# 3 3 12B 20/10/2021 ATY 13 LTP 3 CRT 3 56 GTL
# 4 4 13A 22/10/2021 GGS 7 GSF 12 ERE 45 45 RKS
# 5 5 13A 26/10/2021 BRT 9 ARR 4 GSF 34 46 GLO
# 7 7 14B 04/08/2021 GDS 2 TRE 55 HHS 34 25 MTK
# $`2`
# PID Date Drug4 Dose4 Drug5 Dose5 Drug6 Dose6
# 2 11A 25/10/2021 BET 10 SET 43 BLT 45
# 6 13A 22/10/2021 DFS 5 NA NA
Reduce(function(a, b) merge(a, b, by = c("PID", "Date"), all = TRUE), newspl)
# PID Date Row Drug1 Dose1 Drug2 Dose2 Drug3 Dose3 Age Place Drug4 Dose4 Drug5 Dose5 Drug6 Dose6
# 1 11A 25/10/2021 1 RPG 12 NAT 34 QRT 5 45 PMk BET 10 SET 43 BLT 45
# 2 12B 20/10/2021 3 ATY 13 LTP 3 CRT 3 56 GTL <NA> NA <NA> NA <NA> NA
# 3 13A 22/10/2021 4 GGS 7 GSF 12 ERE 45 45 RKS DFS 5 NA NA
# 4 13A 26/10/2021 5 BRT 9 ARR 4 GSF 34 46 GLO <NA> NA <NA> NA <NA> NA
# 5 14B 04/08/2021 7 GDS 2 TRE 55 HHS 34 25 MTK <NA> NA <NA> NA <NA> NA
Notes:
The underlying premise of this is that you want to merge the rows onto previous rows. This means (to me) using base::merge or dplyr::full_join; two good links for understanding these concepts, in case you are not aware: How to join (merge) data frames (inner, outer, left, right), What's the difference between INNER JOIN, LEFT JOIN, RIGHT JOIN and FULL JOIN?
To do that, we need to determine which rows are duplicates of previous; further, we need to know how many previous same-key rows there are. There are a few ways to do this, but I think the easiest is with base::split. In this case, no PID/Date combination has more than two rows, but if you had one combination that mandated a third row, spl would be length-3, and the resulting names would go out to Drug9/Dose9.
The second portion (nms <- ...) is where we work on the names. The first few steps create a nms dataframe that we'll use to map from old to new names. Since we're concerned about contiguous numbering through all multi-row groups, we aggregate on the base (number removed) of the Drug/Dose names, so that we number all Drug columns from Drug1 through how many there are.
Note: this assumes that there are always perfect pairs of Drug#/Dose#; if there is ever a mismatch, then the numbering will be suspect.
We end with nms being a split dataframe, just like spl of the data. This is useful and important, since we'll Map (zip-like lapply) them together.
The third block updates spl with the new names. The result in newspl is just renaming of the columns so that when we merge them together, no column-duplication will occur.
One additional step here is removing unrelated columns from the 2nd and subsequent frame in the list. That is, we keep Age and Place in the first such frame but remove it from the rest. My assumption (based on the NA/empty nature of those fields in duplicate rows) is that we only want to keep the first row's values.
The last step is to iteratively merge them together. The Reduce function is nice for this.
Update:
With the help of akrun see here: Use ~separate after mutate and across
We could:
library(dplyr)
library(stringr)
library(tidyr)
df %>%
group_by(PID) %>%
summarise(across(everything(), ~toString(.))) %>%
mutate(across(everything(), ~ list(tibble(col1 = .) %>%
separate(col1, into = str_c(cur_column(), 1:3), sep = ",\\s+", fill = "left", extra = "drop")))) %>%
unnest(c(PID, Row, Date, Drug1, Dose1, Drug2, Dose2, Drug3, Dose3, Age,
Place)) %>%
distinct() %>%
select(-1, -2)
PID3 Row1 Row2 Row3 Date1 Date2 Date3 Drug11 Drug12 Drug13 Dose11 Dose12 Dose13 Drug21 Drug22 Drug23 Dose21 Dose22 Dose23 Drug31 Drug32 Drug33 Dose31 Dose32 Dose33 Age1 Age2 Age3 Place1 Place2 Place3
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 11A NA 1 2 NA 25/10/2021 25/10/2021 NA RPG BET NA 12 10 NA NAT SET NA 34 43 NA QRT BLT NA 5 45 NA 45 NA NA PMk NA
2 12B NA NA 3 NA NA 20/10/2021 NA NA ATY NA NA 13 NA NA LTP NA NA 3 NA NA CRT NA NA 3 NA NA 56 NA NA GTL
3 13A 4 5 6 22/10/2021 26/10/2021 22/10/2021 GGS BRT DFS 7 9 5 GSF ARR NA 12 4 NA ERE GSF NA 45 34 NA 45 46 NA RKS GLO NA
4 14B NA NA 7 NA NA 04/08/2021 NA NA GDS NA NA 2 NA NA TRE NA NA 55 NA NA HHS NA NA 34 NA NA 25 NA NA MTK
First answer:
Keeping the excellent explanation of #r2evans in mind! We could do it this way if really desired.
library(dplyr)
df %>%
group_by(PID) %>%
summarise(across(everything(), ~toString(.)))
output:
PID Row Date Drug1 Dose1 Drug2 Dose2 Drug3 Dose3 Age Place
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 11A 1, 2 25/10/2021, 25/10/2021 RPG, BET 12, 10 NAT, SET 34, 43 QRT, BLT 5, 45 45, NA PMk, NA
2 12B 3 20/10/2021 ATY 13 LTP 3 CRT 3 56 GTL
3 13A 4, 5, 6 22/10/2021, 26/10/2021, 22/10/2021 GGS, BRT, DFS 7, 9, 5 GSF, ARR, NA 12, 4, NA ERE, GSF, NA 45, 34, NA 45, 46, NA RKS, GLO, NA
4 14B 7 04/08/2021 GDS 2 TRE 55 HHS 34 25 MTK
Another tidyverse-based solution, with a pivot_longer followed by a pivot_wider:
library(tidyverse)
# Note that my dataframe does not contain column Row
df %>%
mutate(across(starts_with("Dose"), as.character)) %>%
pivot_longer(!c(PID, Date, Age, Place),names_to = "trm") %>%
group_by(PID, Date) %>%
fill(Age, Place) %>%
mutate(trm = paste(trm,1:n(),sep="_")) %>%
ungroup %>%
pivot_wider(c(PID, Date, Age, Place), names_from = trm) %>%
rename_with(~ paste0("Drug",1:length(.x)), starts_with("Drug")) %>%
rename_with(~ paste0("Dose",1:length(.x)), starts_with("Dose")) %>%
mutate(across(starts_with("Dose"), as.numeric))
#> # A tibble: 5 × 16
#> PID Date Age Place Drug1 Dose1 Drug2 Dose2 Drug3 Dose3 Drug4 Dose4 Drug5
#> <chr> <chr> <int> <chr> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr>
#> 1 11A 25/10… 45 PMk RPG 12 NAT 34 QRT 5 BET 10 SET
#> 2 12B 20/10… 56 GTL ATY 13 LTP 3 CRT 3 <NA> NA <NA>
#> 3 13A 22/10… 45 RKS GGS 7 GSF 12 ERE 45 DFS 5 <NA>
#> 4 13A 26/10… 46 GLO BRT 9 ARR 4 GSF 34 <NA> NA <NA>
#> 5 14B 04/08… 25 MTK GDS 2 TRE 55 HHS 34 <NA> NA <NA>
#> # … with 3 more variables: Dose5 <dbl>, Drug6 <chr>, Dose6 <dbl>
a data.table approach
library(data.table)
DT <- fread("Row PID Date Drug1 Dose1 Drug2 Dose2 Drug3 Dose3 Age Place
1 11A 25/10/2021 RPG 12 NAT 34 QRT 5 45 PMk
2 11A 25/10/2021 BET 10 SET 43 BLT 45
3 12B 20/10/2021 ATY 13 LTP 3 CRT 3 56 GTL
4 13A 22/10/2021 GGS 7 GSF 12 ERE 45 45 RKS
5 13A 26/10/2021 BRT 9 ARR 4 GSF 34 46 GLO
6 13A 22/10/2021 DFS 5
7 14B 04/08/2021 GDS 2 TRE 55 HHS 34 25 MTK")
dcast(DT)
DT
# Melt to long format
ans <- melt(DT, id.vars = c("PID", "Date"),
measure.vars = patterns(drug = "^Drug", dose = "^Dose"),
na.rm = TRUE)
# Paste and Collapse, use ; as separator
ans <- ans[, lapply(.SD, paste0, collapse = ";"), by = .(PID, Date)]
# Split string on ;
ans[, paste0("Drug", 1:length(tstrsplit(ans$drug, ";"))) := tstrsplit(drug, ";")]
ans[, paste0("Dose", 1:length(tstrsplit(ans$dose, ";"))) := tstrsplit(dose, ";")]
#join Age + Place data
ans[DT[!is.na(Age), ], `:=`(Age = i.Age, Place = i.Place), on = .(PID, Date)]
ans[, -c("variable", "drug", "dose")]
# PID Date Drug1 Drug2 Drug3 Drug4 Drug5 Drug6 Dose1 Dose2 Dose3 Dose4 Dose5 Dose6 Age Place
# 1: 11A 25/10/2021 RPG BET NAT SET QRT BLT 12 10 34 43 5 45 45 PMk
# 2: 12B 20/10/2021 ATY LTP CRT <NA> <NA> <NA> 13 3 3 <NA> <NA> <NA> 56 GTL
# 3: 13A 22/10/2021 GGS DFS GSF ERE <NA> <NA> 7 5 12 45 <NA> <NA> 45 RKS
# 4: 13A 26/10/2021 BRT ARR GSF <NA> <NA> <NA> 9 4 34 <NA> <NA> <NA> 46 GLO
# 5: 14B 04/08/2021 GDS TRE HHS <NA> <NA> <NA> 2 55 34 <NA> <NA> <NA> 25 MTK
Another answer to the festival.
Reading data from this page
require(rvest)
require(tidyverse)
d = read_html("https://stackoverflow.com/q/69787018/694915") %>%
html_nodes("table") %>%
html_table(fill = TRUE)
List of dose per PID and DATE
# primera tabla
d[[1]] -> df
df %>%
pivot_longer(
cols = starts_with("Drug"),
values_to = "Drug"
) %>%
select( !name ) %>%
pivot_longer(
cols = starts_with("Dose"),
values_to = "Dose"
) %>%
select( !name ) %>%
drop_na() %>%
pivot_wider(
names_from = Drug,
values_from = Dose ,
values_fill = list(0)
) -> dose
Variable dose contains this data
(https://i.stack.imgur.com/lc3iN.png)
Not that elegant as previous ones, but is an idea to see the whole treatment per PID.

Spread and Gather table return duplicated rows with NA values

I have a table with categories and sub categories encoded in this format of columns name:
Date| Admissions__0 |Attendance__0 |Tri_1__0|Tri_2__0|...
Tri_1__1|Tri_2__1|...|
and I would like to change it to this format of columns using spread and gather function of tidyverse:
Date| Country code| Admissions| Attendance| Tri_1|Tri_2|...
I tried a solution posted but the outcome actually return multiple rows with NA rather than a single row.
My code used:
temp <- data %>% gather(key="columns",value ="dt",-Date)
temp <- temp %>% mutate(category = gsub(".*__","",columns)) %>% mutate(columns = gsub("__\\d","",columns))
temp %>% mutate(row = row_number()) %>% spread(key="columns",value="dt")
And my results is:
Date country_code row admissions attendance Tri_1 Tri_2 Tri_3 Tri_4 Tri_5
<chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 01-APR-2014 0 275 NA 209 NA NA NA NA NA
2 01-APR-2014 0 640 84 NA NA NA NA NA NA
3 01-APR-2014 0 1005 NA NA 5 NA NA NA NA
4 01-APR-2014 0 1370 NA NA NA 33 NA NA NA
5 01-APR-2014 0 1735 NA NA NA NA 62 NA NA
6 01-APR-2014 0 2100 NA NA NA NA NA 80 NA
7 01-APR-2014 0 2465 NA NA NA NA NA NA 29
8 01-APR-2014 1 2830 NA 138 NA NA NA NA NA
9 01-APR-2014 1 3195 66 NA NA NA NA NA NA
10 01-APR-2014 1 3560 NA NA N/A NA NA NA NA
My expected results:
Date country_code row admissions attendance Tri_1 Tri_2 Tri_3 Tri_4 Tri_5
<chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 01-APR-2014 0 275 84 209 5 33 62 80 29
8 01-APR-2014 1 2830 66 138 66 ... ... ... ...
We can do a summarise_at coalesce to remove the NA elements after the spread
library(tidyverse)
data %>%
gather(key = "columns", val = "dt", -Date, na.rm = TRUE) %>%
mutate(category = gsub(".*__","",columns)) %>%
mutate(columns = gsub("__\\d","",columns)) %>%
group_by(Date, dt, columns, category) %>%
mutate(rn = row_number()) %>%
spread(columns, dt) %>%
select(-V1) %>%
summarise_at(vars(Admissions:Tri_5),list(~ coalesce(!!! .))) # %>%
# filter if needed
#filter_at(vars(Admissions:Tri_5), all_vars(!is.na(.)))

Cleaning a data.frame in a semi-reshape/semi-aggregate fashion

First time posting something here, forgive any missteps in my question.
In my example below I've got a data.frame where the unique identifier is the tripID with the name of the vessel, the species code, and a catch metric.
> testFrame1 <- data.frame('tripID' = c(1,1,2,2,3,4,5),
'name' = c('SS Anne','SS Anne', 'HMS Endurance', 'HMS Endurance','Salty Hippo', 'Seagallop', 'Borealis'),
'SPP' = c(101,201,101,201,102,102,103),
'kept' = c(12, 22, 14, 24, 16, 18, 10))
> testFrame1
tripID name SPP kept
1 1 SS Anne 101 12
2 1 SS Anne 201 22
3 2 HMS Endurance 101 14
4 2 HMS Endurance 201 24
5 3 Salty Hippo 102 16
6 4 Seagallop 102 18
7 5 Borealis 103 10
I need a way to basically condense the data.frame so that all there is only one row per tripID as shown below.
> testFrame1
tripID name SPP kept SPP.1 kept.1
1 1 SS Anne 101 12 201 22
2 2 HMS Endurance 101 14 201 24
3 3 Salty Hippo 102 16 NA NA
4 4 Seagallop 102 18 NA NA
5 5 Borealis 103 10 NA NA
I've looked into tidyr and reshape but neither of those are can deliver quite what I'm asking for. Is there anything out there that does this quasi-reshaping?
Here are two alternatives using base::reshape and data.table::dcast:
1) base R
reshape(transform(testFrame1,
timevar = ave(tripID, tripID, FUN = seq_along)),
idvar = cbind("tripID", "name"),
timevar = "timevar",
direction = "wide")
# tripID name SPP.1 kept.1 SPP.2 kept.2
#1 1 SS Anne 101 12 201 22
#3 2 HMS Endurance 101 14 201 24
#5 3 Salty Hippo 102 16 NA NA
#6 4 Seagallop 102 18 NA NA
#7 5 Borealis 103 10 NA NA
2) data.table
library(data.table)
setDT(testFrame1)
dcast(testFrame1, tripID + name ~ rowid(tripID), value.var = c("SPP", "kept"))
# tripID name SPP_1 SPP_2 kept_1 kept_2
#1: 1 SS Anne 101 201 12 22
#2: 2 HMS Endurance 101 201 14 24
#3: 3 Salty Hippo 102 NA 16 NA
#4: 4 Seagallop 102 NA 18 NA
#5: 5 Borealis 103 NA 10 NA
Great reproducible post considering it's your first. Here's a way to do it with dplyr and tidyr -
testFrame1 %>%
group_by(tripID, name) %>%
summarise(
SPP = toString(SPP),
kept = toString(kept)
) %>%
ungroup() %>%
separate("SPP", into = c("SPP", "SPP.1"), sep = ", ", extra = "drop", fill = "right") %>%
separate("kept", into = c("kept", "kept.1"), sep = ", ", extra = "drop", fill = "right")
# A tibble: 5 x 6
tripID name SPP SPP.1 kept kept.1
<dbl> <chr> <chr> <chr> <chr> <chr>
1 1.00 SS Anne 101 201 12 22
2 2.00 HMS Endurance 101 201 14 24
3 3.00 Salty Hippo 102 <NA> 16 <NA>
4 4.00 Seagallop 102 <NA> 18 <NA>
5 5.00 Borealis 103 <NA> 10 <NA>

Resources