R: Reshaping data as multiple columns into rows - r

I have a df which includes multiple columns; you can find my template below. I would like to reshape the columns into rows in R. I am sure it is possible with the tidyr::gather() function, but I cannot manage it.
If someone could help me I would be glad!
Best wishes
# Df I have
A1 A2 A3 A4 B1 B2 B3 B4 C1 C2 C3 C4 D1 D2 D3 D4
X1 X2 X3 X4 a b c d e f g h i j k l
Y1 Y2 Y3 Y4 m n o p
Z1 Z2 Z3 Z4 r s t u w v y z
# Df I would like to reshape
Col1 Col2 Col3 Col4
X1 X2 X3 X4 a b c d
X1 X2 X3 X4 e f g h
X1 X2 X3 X4 i j k l
Y1 Y2 Y3 Y4 m n o p
Z1 Z2 Z3 Z4 r s t u
Z1 Z2 Z3 Z4 w v y z

We could also do this with a single pivot_longer
library(dplyr)
library(tidyr)
library(stringr)
# Split each column name between its letter and its digit: the letter goes to
# `grp`, and the digit (through the ".value" sentinel) names the output column
# that receives the value. Rows whose values are all NA are dropped.
df %>%
  pivot_longer(
    cols = -id,
    names_to = c("grp", ".value"),
    names_sep = "(?<=[A-Z])(?=[0-9])",
    values_drop_na = TRUE
  ) %>%
  select(-grp) %>%
  # Prefix every column except the first (`id`) with "Col". rename_with() is
  # the current replacement for the superseded rename_at().
  rename_with(~ str_c('Col', .x), .cols = -1)
# A tibble: 7 x 5
# id Col1 Col2 Col3 Col4
# <int> <chr> <chr> <chr> <chr>
#1 1 a b c d
#2 1 e f g h
#3 1 i j k l
#4 2 m n o p
#5 2 q <NA> <NA> <NA>
#6 3 r s t u
#7 3 w v y z
data
df <- structure(list(id = 1:3, A1 = c("a", "m", "r"), A2 = c("b", "n",
"s"), A3 = c("c", "o", "t"), A4 = c("d", "p", "u"), B1 = c("e",
"q", "w"), B2 = c("f", NA, "v"), B3 = c("g", NA, "y"), B4 = c("h",
NA, "z"), C1 = c("i", NA, NA), C2 = c("j", NA, NA), C3 = c("k",
NA, NA), C4 = c("l", NA, NA), D1 = c(NA, NA, NA), D2 = c(NA,
NA, NA), D3 = c(NA, NA, NA), D4 = c(NA, NA, NA)), class = "data.frame",
row.names = c("1",
"2", "3"))

I bet there are more elegant solutions, but this one uses tidyr and dplyr:
Suppose your data looks like
> df
# A tibble: 3 x 17
id A1 A2 A3 A4 B1 B2 B3 B4 C1 C2 C3 C4 D1 D2 D3 D4
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 a b c d e f g h i j k l NA NA NA NA
2 2 m n o p q NA NA NA NA NA NA NA NA NA NA NA
3 3 r s t u w v y z NA NA NA NA NA NA NA NA
I replaced your X1 X2 X3 X4, ... by an indexing column and I added a q in column B1.
Using
# Step 1: stack every column whose name ends in a digit into long form. The
# pattern ".(.)" captures the second character of each column name, so A1, B1,
# C1 and D1 all end up with set == "1".
df %>%
pivot_longer(cols=matches("\\d$"),
names_to = c("set"),
names_pattern = ".(.)") %>%
# Step 2: spread back out to one column per `set` value, prefixed with "Col".
# Several values land in the same cell, so values_fn = list collects them into
# list-columns.
pivot_wider(names_from="set",
names_prefix="Col",
values_fn = list) %>%
# Step 3: expand the list-columns into one row per element.
unnest(matches("\\d$")) %>%
# Step 4: working row by row, drop the rows in which every Col* value is NA.
# Note that the result is still a rowwise data frame afterwards.
rowwise() %>%
filter(sum(is.na(c_across(matches("\\d$")))) != ncol(.) - 1) # -1 because of the indexing column
returns
# A tibble: 7 x 5
# Rowwise:
id Col1 Col2 Col3 Col4
<dbl> <chr> <chr> <chr> <chr>
1 1 a b c d
2 1 e f g h
3 1 i j k l
4 2 m n o p
5 2 q NA NA NA
6 3 r s t u
7 3 w v y z

Related

Rolling paste strings across columns

I have this type of data:
df <- data.frame(
w1 = c("A", "B", "C", "E", "F", "G"),
w2 = c("B", "G", "C", "D", "E", "V"),
w3 = c("D", "S", "O", "F", NA, "N"),
w4 = c("E", "U", NA, "T", NA, NA),
w5 = c("C", NA, NA, NA, NA, NA)
)
I need to iterate through column pairs to rolling-paste the separate strings into bigrams. Note that in the actual data the strings are of variable character length and character type.
I've tried this but it fails:
df[, paste0("bigr_", 1:4, "_", 2:5)] <- lapply(df[, 1:5],
function(x) paste(x[i], x[i+1], sep = " "))
The expected output is:
w1 w2 w3 w4 w5 bigr_1_2 bigr_2_3 bigr_3_4 bigr_4_5
1 A B D E C A B B D D E E C
2 B G S U <NA> B G G S S U <NA>
3 C C O <NA> <NA> C C C O <NA> <NA>
4 E D F T <NA> E D D F F T <NA>
5 F E <NA> <NA> <NA> F E <NA> <NA> <NA>
6 G V N <NA> <NA> G V V N <NA> <NA>
I'd be most interested in a dplyr solution but am open and grateful for other solutions as well.
As you said you're most interested in a dplyr solution, this can be achieved using mutate() and across(). You can alter the function applied to each column if this doesn't achieve the exact desired output.
df %>%
  mutate(
    across(
      # For the first four columns (i.e. has number 1-4 in column name)
      matches("[1-4]"),
      # Pair each column with the column immediately to its right
      function(col) {
        # The data in the next column along
        next_col <- cur_data()[[which(names(cur_data()) == cur_column()) + 1]]
        # paste() would turn a missing value into the literal text "NA"
        # (giving "U NA" or "NA NA"), so return a real NA whenever either
        # half of the bigram is missing
        if_else(
          is.na(col) | is.na(next_col),
          NA_character_,
          paste(col, next_col, sep = " ")
        )
      },
      .names = "{gsub(pattern = 'w', replacement = 'bigr_', {col})}" # alter name of new cols (replace 'w' with 'bigr_')
    )
  ) %>%
  # Rename columns to match the desired output: bigr_1 -> bigr_1_2, etc.
  rename_with(
    .cols = matches("bigr"),
    .fn = function(colname) {
      paste0(colname, "_", as.numeric(gsub(pattern = "bigr_", replacement = "", colname)) + 1)
    }
  )
df <- data.frame(
w1 = c("A", "B", "C", "E", "F", "G"),
w2 = c("B", "G", "C", "D", "E", "V"),
w3 = c("D", "S", "O", "F", NA, "N"),
w4 = c("E", "U", NA, "T", NA, NA),
w5 = c("C", NA, NA, NA, NA, NA)
)
library(tidyverse)
library(janitor)
df %>%
  # Tag each row so the words can be regrouped after reshaping
  mutate(rn = row_number()) %>%
  # One row per (rn, word); missing words are dropped here
  pivot_longer(-rn, values_drop_na = TRUE) %>%
  group_by(rn) %>%
  # Join each word to the next word of the same row. The last word of a row has
  # no successor, so test lead(value) for NA directly; searching the pasted
  # text for "_NA" would also blank real bigrams whose second word merely
  # starts with "NA".
  mutate(
    bigr = if_else(
      is.na(lead(value)),
      NA_character_,
      paste0(value, "_", lead(value))
    )
  ) %>%
  # id_cols is passed by name; passing it by position is deprecated in tidyr
  pivot_wider(id_cols = rn, names_from = name, values_from = c(value, bigr)) %>%
  # The bigram column for the last word is entirely NA; drop it
  remove_empty("cols") %>%
  ungroup() %>%
  select(-rn) %>%
  # value_w1 -> w1, bigr_w1 -> bigr_1
  rename_with(~ str_remove(string = ., "value_")) %>%
  rename_with(~ str_replace(., "(_w)(\\d+)", "_\\2"))
#> # A tibble: 6 × 9
#> w1 w2 w3 w4 w5 bigr_1 bigr_2 bigr_3 bigr_4
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 A B D E C A_B B_D D_E E_C
#> 2 B G S U <NA> B_G G_S S_U <NA>
#> 3 C C O <NA> <NA> C_C C_O <NA> <NA>
#> 4 E D F T <NA> E_D D_F F_T <NA>
#> 5 F E <NA> <NA> <NA> F_E <NA> <NA> <NA>
#> 6 G V N <NA> <NA> G_V V_N <NA> <NA>
Created on 2022-04-26 by the reprex package (v2.0.1)
As long as you don't have a string that is NA, you could try:
df %>%
mutate(across(-1,
~ paste(get(paste0("w", match(cur_column(), names(cur_data())) - 1)), .),
.names = 'bigr_{paste0("w", match(.col, names(cur_data())) - 1)}_{.col}')) %>%
mutate(across(starts_with("bigr"),
~ if_else(str_count(., "NA") != 0, NA_character_, .)))
w1 w2 w3 w4 w5 bigr_w1_w2 bigr_w2_w3 bigr_w3_w4 bigr_w4_w5
1 A B D E C A B B D D E E C
2 B G S U <NA> B G G S S U <NA>
3 C C O <NA> <NA> C C C O <NA> <NA>
4 E D F T <NA> E D D F F T <NA>
5 F E <NA> <NA> <NA> F E <NA> <NA> <NA>
6 G V N <NA> <NA> G V V N <NA> <NA>
As you are open to non-dplyr solutions, we can do it in base R by modifying your original code:
# Pair every column with its right-hand neighbour: w1..w4 against w2..w5.
left <- df[, 1:4]
right <- df[, 2:5]
df[, paste0("bigr_", 1:4, "_", 2:5)] <- mapply(paste, left, right)
# paste() writes a missing value as the text "NA", so collect the (row, col)
# position of every pair in which either half is missing ...
x <- which(is.na(left) | is.na(right), arr.ind = TRUE)
# ... move the column index past the five original columns so that it points
# at the matching bigr_* column, and reset those cells to a real NA.
x[, 2] <- x[, 2] + 5
df[x] <- NA
df
# w1 w2 w3 w4 w5 bigr_1_2 bigr_2_3 bigr_3_4 bigr_4_5
# 1 A B D E C A B B D D E E C
# 2 B G S U <NA> B G G S S U <NA>
# 3 C C O <NA> <NA> C C C O <NA> <NA>
# 4 E D F T <NA> E D D F F T <NA>
# 5 F E <NA> <NA> <NA> F E <NA> <NA> <NA>
# 6 G V N <NA> <NA> G V V N <NA> <NA>
We can use the tidytext package as follows:
df %>%
  rowid_to_column() %>%
  # Join the words of each row into one string. na.rm = TRUE leaves missing
  # cells out; without it they are written as the text "NA" and come back as
  # bogus bigrams such as "U NA" and "NA NA".
  unite(col, -rowid, sep = ' ', na.rm = TRUE) %>%
  # Split each string into overlapping two-word tokens, keeping the case
  tidytext::unnest_ngrams(value, 'col', 2, to_lower = FALSE) %>%
  group_by(rowid) %>%
  # Number the bigrams within each row; this becomes the column suffix
  mutate(name = row_number()) %>%
  # id_cols is passed by name; passing it by position is deprecated in tidyr
  pivot_wider(id_cols = rowid, names_prefix = 'bgram_')
# A tibble: 6 x 5
# Groups: rowid [6]
rowid bgram_1 bgram_2 bgram_3 bgram_4
<int> <chr> <chr> <chr> <chr>
1 1 A B B D D E E C
2 2 B G G S S U NA
3 3 C C C O NA NA
4 4 E D D F F T NA
5 5 F E NA NA NA
6 6 G V V N NA NA
using data.table
# Assumes df is already a data.table (e.g. after setDT(df)) -- confirm.
# Adds bigr_1_2 .. bigr_4_5 by reference, pairing columns 1:4 with columns 2:5.
# NA_character_ (rather than a bare logical NA) keeps every new column of type
# character even when all of its pairs are missing.
df[, (paste("bigr", 1:4, 2:5, sep = "_")) := Map(
  function(x, y) ifelse(is.na(x) | is.na(y), NA_character_, paste(x, y)),
  .SD[, 1:4], .SD[, 2:5]
)]

How collect members of a column based on the value of a specific member in that column in R

In the following data frame, I want to collect members of B1, where their value in B2 is equal to or more than the value of "b" in B2. And then after this new information, count how many times each of the B1 members occurred.
dataframe:
ID B1 B2
z1 a 2.5
z1 b 1.7
z1 c 170
z1 c 9
z1 d 3
y2 a 0
y2 b 21
y2 c 15
y2 c 101
y2 d 30
y2 d 3
y2 d 15.5
x3 a 30.8
x3 a 54
x3 a 0
x3 b 30.8
x3 c 30.8
x3 d 7
so the result would be:
ID B1 B2
z1 a 2.5
z1 c 170
z1 c 9
z1 d 3
y2 c 101
y2 d 30
x3 a 30.8
x3 a 54
x3 c 30.8
and
ID B1 count
z1 a 1
z1 c 2
z1 d 1
y2 a 0
y2 c 1
y2 d 1
x3 a 2
x3 c 1
x3 d 0
Grouped by 'ID', filter where the 'B2' is greater than or equal to 'B2' where 'B1' is 'b' as well as create another condition where 'B1' is not equal to 'b'
library(dplyr)
out1 <- df1 %>%
group_by(ID) %>%
filter(any(B1 == "b") & B2 >= min(B2[B1 == "b"]), B1 != 'b')
-output
> out1
# A tibble: 9 × 3
# Groups: ID [3]
ID B1 B2
<chr> <chr> <dbl>
1 z1 a 2.5
2 z1 c 170
3 z1 c 9
4 z1 d 3
5 y2 c 101
6 y2 d 30
7 x3 a 30.8
8 x3 a 54
9 x3 c 30.8
The second output is obtained by grouping and using summarise to get the number of rows, and then filling in the missing combinations with complete
library(tidyr)
out1 %>%
group_by(B1, .add = TRUE) %>%
summarise(count = n(), .groups = "drop_last") %>%
complete(B1 = unique(.$B1), fill = list(count = 0)) %>%
ungroup
# A tibble: 9 × 3
ID B1 count
<chr> <chr> <int>
1 x3 a 2
2 x3 c 1
3 x3 d 0
4 y2 a 0
5 y2 c 1
6 y2 d 1
7 z1 a 1
8 z1 c 2
9 z1 d 1
data
df1 <- structure(list(ID = c("z1", "z1", "z1", "z1", "z1", "y2", "y2",
"y2", "y2", "y2", "y2", "y2", "x3", "x3", "x3", "x3", "x3", "x3"
), B1 = c("a", "b", "c", "c", "d", "a", "b", "c", "c", "d", "d",
"d", "a", "a", "a", "b", "c", "d"), B2 = c(2.5, 1.7, 170, 9,
3, 0, 21, 15, 101, 30, 3, 15.5, 30.8, 54, 0, 30.8, 30.8, 7)),
class = "data.frame", row.names = c(NA,
-18L))
Using tidyverse:
library(tidyverse)
df %>%
  group_by(ID) %>%
  # Keep the rows whose B2 is equal to or more than the "b" value of their ID
  # (>=, as the question asks; a strict > loses ties such as x3/a = 30.8), and
  # drop the "b" rows themselves. min() reduces the threshold to one value even
  # if an ID has several "b" rows.
  filter(B2 >= min(B2[B1 == "b"]), B1 != "b") %>%
  group_by(ID, B1) %>%
  count(name = "count") %>%
  as.data.frame()
#> ID B1 count
#> 1 x3 a 2
#> 2 x3 c 1
#> 3 y2 c 1
#> 4 y2 d 1
#> 5 z1 a 1
#> 6 z1 c 2
#> 7 z1 d 1
Created on 2022-04-26 by the reprex package (v2.0.1)

R: mutate columns and place before specific columns and name them based on these specific columns

Suppose the following data structure:
structure(list(`1.a` = c("a", NA, "a"), `1.b` = c("b", "b", NA
), `2` = c("ba", "ba", "ab"), `3.a` = c("a", "a", NA), `3.b` = c("b",
NA, "b")), row.names = c(NA, -3L), class = c("tbl_df", "tbl",
"data.frame"))
# A tibble: 3 x 5
`1.a` `1.b` `2` `3.a` `3.b`
<chr> <chr> <chr> <chr> <chr>
1 a b ba a b
2 NA b ba a NA
3 a NA ab NA b
Now, for the columns with .a etc., I want to create a column named X (the part/number before the dot) in front of the .a columns and paste the cell values together, keeping the "order". Result I want:
# A tibble: 3 x 7
`1` `1.a` `1.b` `2` `3` `3.a` `3.b`
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 ab a b ab ab a b
2 b NA b a a a NA
3 a a NA b b NA b
A base R option
# Group the columns by the part of their name before the first dot, put a
# pasted summary column in front of every multi-column group, and bind all
# groups back together side by side.
do.call(
  cbind,
  unname(
    lapply(
      split.default(df, gsub("\\..*", "", names(df))),
      function(grp) {
        # A lone column (such as `2`) has nothing to combine; keep it as is.
        if (length(grp) <= 1) {
          return(grp)
        }
        # Row by row, concatenate the non-missing cells in column order.
        combined <- data.frame(
          apply(grp, 1, function(cells) paste0(na.omit(unlist(cells)), collapse = ""))
        )
        # Name the new column after the shared prefix and place it first.
        combined <- setNames(combined, unique(gsub("\\..*", "", names(grp))))
        cbind(combined, grp)
      }
    )
  )
)
gives
1 1.a 1.b 2 3 3.a 3.b
1 ab a b ba ab a b
2 b <NA> b ba a a <NA>
3 a a <NA> ab b <NA> b

Joining two dataframes by concatenating columns

I have two dataframes with the same structure - both have two ID columns and 25 string data columns. I want to join the two and concatenate the strings in the data columns when the IDs match. So, for example:
df_1:
id_1 id_2 col_1 col2 ... col_25
a1 b1 A A ... <NA>
a1 b2 A <NA> ... A
a2 b1 <NA> <NA> ... A
df_2:
id_1 id_2 col_1 col2 ... col_25
a1 b1 B <NA> ... <NA>
a1 b2 <NA> B ... B
a1 b3 B <NA> ... B
Combined, this should give
df_combined:
id_1 id_2 col_1 col2 ... col_25
a1 b1 A, B A ... <NA>
a1 b2 A B ... A, B
a1 b3 B <NA> ... B
a2 b1 <NA> <NA> ... A
When I try to use join or merge, it repeats everything except the ID columns (so I end up with 50 data columns). Do I need to use something else?
Thanks!
You can do this if you don't have any empty string :
library(dplyr)
bind_rows(df_1,df_2) %>%
group_by(id_1,id_2) %>%
summarize_all(~ paste(na.omit(.x),collapse=", ")) %>%
`[<-`(.=="",value=NA)
with magrittr you can avoid the not so pretty '[<-' and replace it by inset
library(magrittr)
bind_rows(df_1,df_2) %>%
group_by(id_1,id_2) %>%
summarize_all(~ paste(na.omit(.x),collapse=", ")) %>%
inset(.=="",value=NA)
There is an alternative solution using melt() and dcast() to reshape the data:
library(data.table)
rbind(setDT(df_1), setDT(df_2))[
, melt(.SD, measure.var = patterns("col"), na.rm = TRUE)][
, dcast(.SD, id_1 + id_2 ~ variable, toString, fill = NA)]
id_1 id_2 col_1 col2 col_25
1: a1 b1 A, B A NA
2: a1 b2 A B A, B
3: a1 b3 B NA B
4: a2 b1 NA NA A
Data
df_1 <- fread(
"id_1 id_2 col_1 col2 ... col_25
a1 b1 A A ... <NA>
a1 b2 A <NA> ... A
a2 b1 <NA> <NA> ... A",
drop = 5L, na.strings = "<NA>"
)
df_2 <- fread(
"id_1 id_2 col_1 col2 ... col_25
a1 b1 B <NA> ... <NA>
a1 b2 <NA> B ... B
a1 b3 B <NA> ... B",
drop = 5L, na.strings = "<NA>"
)
To elaborate on the idea commented by @zx8754, and using the dplyr package,
library(dplyr)
df1 %>%
  bind_rows(df2) %>%
  group_by(id_1, id_2) %>%
  # For every id pair, join the non-missing strings of each data column with a
  # space. across() replaces the deprecated funs() and the superseded
  # mutate_at() / summarise_all() / mutate_all(). Dropping the NAs up front
  # avoids the doubled spaces that padding with '' leaves behind when a missing
  # value sits between two present ones.
  summarise(
    across(everything(), ~ paste(na.omit(.x), collapse = ' ')),
    .groups = "drop_last"
  ) %>%
  # A cell with no strings at all comes out as '', so turn it back into NA
  mutate(across(everything(), ~ na_if(.x, '')))
which gives,
# A tibble: 4 x 5
# Groups: id_1 [2]
id_1 id_2 col_1 col2 col_25
<chr> <chr> <chr> <chr> <chr>
1 a1 b1 A B A <NA>
2 a1 b2 A B A B
3 a1 b3 B <NA> B
4 a2 b1 <NA> <NA> A
NOTE:
Above script assumes that your NAs are actual NA (not characters)
Your variables are as.character
DATA
dput(df1)
structure(list(id_1 = c("a1", "a1", "a2"), id_2 = c("b1", "b2",
"b1"), col_1 = c("A", "A", NA), col2 = c("A", NA, NA), col_25 = c(NA,
"A", "A")), .Names = c("id_1", "id_2", "col_1", "col2", "col_25"
), row.names = c(NA, -3L), class = "data.frame")
> dput(df2)
structure(list(id_1 = c("a1", "a1", "a1"), id_2 = c("b1", "b2",
"b3"), col_1 = c("B", NA, "B"), col2 = c(NA, "B", NA), col_25 = c(NA,
"B", "B")), .Names = c("id_1", "id_2", "col_1", "col2", "col_25"
), row.names = c(NA, -3L), class = "data.frame")

How to return first element of a group excluding NA's when non-NA values exist

I have a data frame named df which looks like.
x y
A NA
B d1
L d2
F c1
L s2
A c4
B NA
B NA
A c1
F a5
G NA
H NA
I want to group by x and fill in NA values with the first non-NA element in that group if possible. Note that some groups will not have a non-NA element so returning NA is fine for that case.
df %>% group_by(x) %>% mutate(new_y = first(y))
returns the first value including NA's even when non-NA values exist for that group.
We can use replace
df %>%
group_by(x) %>%
mutate(y = replace(y, is.na(y), y[!is.na(y)][1]))
# x y
# <chr> <chr>
#1 A c4
#2 B d1
#3 L d2
#4 F c1
#5 L s2
#6 A c4
#7 B d1
#8 B d1
#9 A c1
#10 F a5
#11 G <NA>
#12 H <NA>
Or we can do a join in data.table
library(data.table)
# Inner table: sort each `x` with its non-missing `y` first and keep the first
# row per `x`, i.e. the first non-NA `y` when one exists. Joining it back on
# `x` fills the missing `y` values by reference. fcoalesce() is data.table's
# own coalesce; coalesce() is exported by dplyr, not by tidyr, so the original
# library(tidyr) call did not make it available.
setDT(df)[df[order(x, is.na(y)), .SD[1L], x], y := fcoalesce(y, i.y), on = .(x)]
df
# x y
# 1: A c4
# 2: B d1
# 3: L d2
# 4: F c1
# 5: L s2
# 6: A c4
# 7: B d1
# 8: B d1
# 9: A c1
#10: F a5
#11: G NA
#12: H NA
Or using base R
# Within each group of `x`, overwrite the missing `y` values with the group's
# first non-missing `y`; a group with no non-missing value stays NA.
df$y <- ave(
  df$y, df$x,
  FUN = function(vals) {
    missing <- is.na(vals)
    replace(vals, missing, vals[!missing][1])
  }
)
data
df <- structure(list(x = c("A", "B", "L", "F", "L", "A", "B", "B",
"A", "F", "G", "H"), y = c(NA, "d1", "d2", "c1", "s2", "c4",
NA, NA, "c1", "a5", NA, NA)), .Names = c("x", "y"), class = "data.frame",
row.names = c(NA, -12L))

Resources