Rolling paste strings across columns - r

I have this type of data:
df <- data.frame(
w1 = c("A", "B", "C", "E", "F", "G"),
w2 = c("B", "G", "C", "D", "E", "V"),
w3 = c("D", "S", "O", "F", NA, "N"),
w4 = c("E", "U", NA, "T", NA, NA),
w5 = c("C", NA, NA, NA, NA, NA)
)
I need to iterate through column pairs to rolling-paste the separate strings into bigrams. Note that in the actual data the strings are of variable character length and character type.
I've tried this but it fails:
df[, paste0("bigr_", 1:4, "_", 2:5)] <- lapply(df[, 1:5],
function(x) paste(x[i], x[i+1], sep = " "))
The expected output is:
w1 w2 w3 w4 w5 bigr_1_2 bigr_2_3 bigr_3_4 bigr_4_5
1 A B D E C A B B D D E E C
2 B G S U <NA> B G G S S U <NA>
3 C C O <NA> <NA> C C C O <NA> <NA>
4 E D F T <NA> E D D F F T <NA>
5 F E <NA> <NA> <NA> F E <NA> <NA> <NA>
6 G V N <NA> <NA> G V V N <NA> <NA>
I'd be most interested in a dplyr solution but am open and grateful for other solutions as well.

As you said you're most interested in a dplyr solution, this can be achieved using mutate() and across(). You can alter the function applied to each column if this doesn't achieve the exact desired output.
df %>%
mutate(
across(
# For the first four columns (i.e. has number 1-4 in column name)
matches("[1-4]"),
# Apply custom function
function(col) {
# Paste together
paste(
col, # the data in the current column
cur_data()[[which(names(cur_data()) == cur_column())+1]], # and the data in the next column along
sep = " "
)
},
.names = "{gsub(pattern = 'w', replacement = 'bigr_', {col})}" # alter name of new cols (replace 'w' with 'bigr_')
)
) %>%
# EDIT: added to rename columns to match desired output
rename_with(.cols = matches("bigr"),
.fn = function(colname) {
paste0(colname, "_", as.numeric(gsub(pattern = "bigr_", replacement = "", colname))+1)
})

df <- data.frame(
w1 = c("A", "B", "C", "E", "F", "G"),
w2 = c("B", "G", "C", "D", "E", "V"),
w3 = c("D", "S", "O", "F", NA, "N"),
w4 = c("E", "U", NA, "T", NA, NA),
w5 = c("C", NA, NA, NA, NA, NA)
)
library(tidyverse)
library(janitor)
df %>%
mutate(rn = row_number()) %>%
pivot_longer(-rn, values_drop_na = TRUE) %>%
group_by(rn) %>%
mutate(bigr = paste0(value, "_", lead(value))) %>%
mutate(bigr = if_else(str_detect(bigr, "_NA"), NA_character_, bigr)) %>%
pivot_wider(rn, names_from = c(name), values_from = c(value, bigr)) %>%
remove_empty("cols") %>%
ungroup() %>%
select(-rn) %>%
rename_with(~str_remove(string = ., "value_")) %>%
rename_with(~str_replace(., "(_w)(\\d+)", "_\\2"))
#> # A tibble: 6 × 9
#> w1 w2 w3 w4 w5 bigr_1 bigr_2 bigr_3 bigr_4
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 A B D E C A_B B_D D_E E_C
#> 2 B G S U <NA> B_G G_S S_U <NA>
#> 3 C C O <NA> <NA> C_C C_O <NA> <NA>
#> 4 E D F T <NA> E_D D_F F_T <NA>
#> 5 F E <NA> <NA> <NA> F_E <NA> <NA> <NA>
#> 6 G V N <NA> <NA> G_V V_N <NA> <NA>
Created on 2022-04-26 by the reprex package (v2.0.1)

As long as you don't have a string that is NA, you could try:
df %>%
mutate(across(-1,
~ paste(get(paste0("w", match(cur_column(), names(cur_data())) - 1)), .),
.names = 'bigr_{paste0("w", match(.col, names(cur_data())) - 1)}_{.col}')) %>%
mutate(across(starts_with("bigr"),
~ if_else(str_count(., "NA") != 0, NA_character_, .)))
w1 w2 w3 w4 w5 bigr_w1_w2 bigr_w2_w3 bigr_w3_w4 bigr_w4_w5
1 A B D E C A B B D D E E C
2 B G S U <NA> B G G S S U <NA>
3 C C O <NA> <NA> C C C O <NA> <NA>
4 E D F T <NA> E D D F F T <NA>
5 F E <NA> <NA> <NA> F E <NA> <NA> <NA>
6 G V N <NA> <NA> G V V N <NA> <NA>

As you are open to non-dplyr solutions, we can do it in base R by modifying your original code:
df[, paste0("bigr_", 1:4, "_", 2:5)] <- mapply(paste, df[, 1:4], df[, 2:5])
# as NA is coerced to character, we need to find those positions and correct
x <- which(is.na(df[, 1:4]) | is.na(df[, 2:5]), arr.ind = TRUE)
x[, 2] <- x[, 2] + 5
df[x] <- NA
df
# w1 w2 w3 w4 w5 bigr_1_2 bigr_2_3 bigr_3_4 bigr_4_5
# 1 A B D E C A B B D D E E C
# 2 B G S U <NA> B G G S S U <NA>
# 3 C C O <NA> <NA> C C C O <NA> <NA>
# 4 E D F T <NA> E D D F F T <NA>
# 5 F E <NA> <NA> <NA> F E <NA> <NA> <NA>
# 6 G V N <NA> <NA> G V V N <NA> <NA>

We can use the tidytext package as follows:
df %>%
rowid_to_column() %>%
unite(col, -rowid, sep = ' ') %>%
tidytext::unnest_ngrams(value, 'col', 2, to_lower = FALSE) %>%
group_by(rowid) %>%
mutate(name = row_number()) %>%
pivot_wider(rowid, names_prefix = 'bgram_')
# A tibble: 6 x 5
# Groups: rowid [6]
rowid bgram_1 bgram_2 bgram_3 bgram_4
<int> <chr> <chr> <chr> <chr>
1 1 A B B D D E E C
2 2 B G G S S U U NA
3 3 C C C O O NA NA NA
4 4 E D D F F T T NA
5 5 F E E NA NA NA NA NA
6 6 G V V N N NA NA NA

using data.table
df[, (paste("bigr", 1:4, 2:5, sep = "_")) := Map(function(x, y) ifelse(is.na(x) | is.na(y), NA, paste(x, y)), .SD[, 1:4], .SD[, 2:5])]

Related

update names based on columns

I would like update the names based on two columns
My example has 3 originial columns
df <- data.frame(name1 = c("a", "a", "a", "a", 'a', NA, NA, NA),
name2 = c("b", "b", "b", "b", "c", NA, NA, NA),
name3 = c("b", "b", "b", "b", "c", "a", "a", "a"))
df
name1 name2 name3
1 a b b
2 a b b
3 a b b
4 a b b
5 a c c
6 <NA> <NA> a
7 <NA> <NA> a
8 <NA> <NA> a
I would like to update column name3 (or even create a new column) saying that if name1 == a, and name2 == NA, then the a character in name3 will be replaced by b in column name2.
My desired output something like
name1 name2 name3
1 a b b
2 a b b
3 a b b
4 a b b
5 a c c
6 <NA> <NA> b
7 <NA> <NA> b
8 <NA> <NA> b
So far, i am using this df %>% mutate(name3 = ifelse(name1 == "a" & is.na(name2), "b", name3)), but now NA appeared. Any suggestions for this?
Base R
df$name3 <- ifelse(any(df$name1 == "a") & is.na(df$name2), "b", df$name3)
dplyr
library(dplyr)
df %>%
mutate(name3 = case_when(
any(name1 == "a") & is.na(name2) ~ "b",
TRUE ~ name3
))
# name1 name2 name3
#1 a b b
#2 a b b
#3 a b b
#4 a b b
#5 a c c
#6 <NA> <NA> b
#7 <NA> <NA> b
#8 <NA> <NA> b
We can replace == with %in% to eliminate the NAs, because R evaluates NA %in% x to FALSE, but NA==x to NA
df %>% mutate(name3 = ifelse(name1 %in% 'a' & is.na(name2), 'b', name3))
We could use a case_when or ifelse statement:
library(dplyr)
df %>%
mutate(name3 = case_when(any(name1 %in% "a") &
is.na(name2) ~ "b",
TRUE ~ name3))
or:
df %>%
mutate(name3 = ifelse(any(name1 %in% "a") &
is.na(name2), "b", name3))
name1 name2 name3
1 a b b
2 a b b
3 a b b
4 a b b
5 a c c
6 <NA> <NA> b
7 <NA> <NA> b
8 <NA> <NA> b

Wide to long, combining columns in pairs but keeping ID column - R

I have a dataframe of the following type
ID case1 case2 case3 case4
1 A B C D
2 B A
3 E F
4 G C A
5 T
I need to change its format, to a long shape, similar as the below:
ID col1 col2
1 A B
1 A C
1 A D
1 B C
1 B D
1 C D
2 B A
3 E F
4 G C
4 G A
4 C A
5 T
As you can see, I need to maintain the ID and ignore empty columns. There are some cases like T that need to remain in the dataset, but without a col2.
I am honestly not sure how to approach this, so that is why there are no examples of what I have tried.
You can get the data in long format and create all combination of values for each ID if the number of rows is greater than 1 in that ID.
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = -ID, values_drop_na = TRUE) %>%
group_by(ID) %>%
summarise(value = if(n() > 1) list(setNames(as.data.frame(t(combn(value, 2))),
c('col1', 'col2')))
else list(data.frame(col1 = value[1], col2 = NA_character_))) %>%
unnest(value)
# A tibble: 12 x 3
# ID col1 col2
# <int> <chr> <chr>
# 1 1 A B
# 2 1 A C
# 3 1 A D
# 4 1 B C
# 5 1 B D
# 6 1 C D
# 7 2 B A
# 8 3 E F
# 9 4 G C
#10 4 G A
#11 4 C A
#12 5 T NA
data
df <- structure(list(ID = 1:5, case1 = c("A", "B", "E", "G", "T"),
case2 = c("B", "A", "F", "C", NA), case3 = c("C", NA, NA,
"A", NA), case4 = c("D", NA, NA, NA, NA)),
class = "data.frame", row.names = c(NA, -5L))

Transform df into edges df for collaboration network

I have this df, which contains information on collaboration of articles:
author author2 author3 author4
1 A D E F
2 B G
3 C H F
I need to create an edges dataframe, which contains the relationship between the authors, like this:
from to
1 A D
2 A E
3 A F
4 B G
5 C H
6 C F
7 D E
8 D F
9 E F
11 H F
any ideas how to do it?
We can gather each column against the remaining columns i.e. to the left of that column and then binds all.
library(tidyverse)
map_dfr(names(df)[-length(df)], ~select(df,.x:ncol(df)) %>% gather( k,to,-.x) %>%
arrange(!!ensym(.x)) %>% select(-k) %>% filter(to!='') %>%
rename(form=starts_with('author')))
form to
1 A D
2 A E
3 A F
4 B G
5 C H
6 C F
7 D E
8 D F
9 H F
10 E F
Data
df <- structure(list(author = c("A", "B", "C"), author2 = c("D", "G",
"H"), author3 = c("E", "", "F"), author4 = c("F","", "")), class = "data.frame", row.names = c("1",
"2", "3"))
You could apply combn row-wise inside a function, no need for packages.
edges <- setNames(as.data.frame(do.call(rbind, lapply(seq(nrow(d)), function(x)
matrix(unlist(t(combn(na.omit(unlist(d[x, ])), 2))), ncol=2)))), c("from", "to"))
edges
# from to
# 1 A D
# 2 A E
# 3 A F
# 4 D E
# 5 D F
# 6 E F
# 7 B G
# 8 C H
# 9 C F
# 10 H F
Or, using igraph package as #akrun suggested.
library(igraph)
edges <- do.call(rbind, apply(d, 1, function(x)
as_data_frame(graph_from_data_frame(t(combn(na.omit(x), 2))))))
edges
# from to
# 1 A D
# 2 A E
# 3 A F
# 4 D E
# 5 D F
# 6 E F
# 7 B G
# 8 C H
# 9 C F
# 10 H F
Data
d <- structure(list(author = c("A", "B", "C"), author2 = c("D", "G",
"H"), author3 = c("E", NA, "F"), author4 = c("F", NA, NA)), row.names = c(NA,
-3L), class = "data.frame")

How to return first element of a group excluding NA's when non-NA values exist

I have a data frame named df which looks like.
x y
A NA
B d1
L d2
F c1
L s2
A c4
B NA
B NA
A c1
F a5
G NA
H NA
I want to group by x and fill in NA values with the first non-NA element in that group if possible. Note that some groups will not have a non-NA element so returning NA is fine for that case.
df %>% group_by(x) %>% mutate(new_y = first(y))
returns the first value including NA's even when non-NA values exist for that group.
We can use replace
df %>%
group_by(x) %>%
mutate(y = replace(y, is.na(y), y[!is.na(y)][1]))
# x y
# <chr> <chr>
#1 A c4
#2 B d1
#3 L d2
#4 F c1
#5 L s2
#6 A c4
#7 B d1
#8 B d1
#9 A c1
#10 F a5
#11 G <NA>
#12 H <NA>
Or we can do a join in data.table
library(data.table)
library(tidyr)
setDT(df)[df[order(x, is.na(y)), .SD[1L], x], y := coalesce(y, i.y),on = .(x)]
df
# x y
# 1: A c4
# 2: B d1
# 3: L d2
# 4: F c1
# 5: L s2
# 6: A c4
# 7: B d1
# 8: B d1
# 9: A c1
#10: F a5
#11: G NA
#12: H NA
Or using base R
df$y <- with(df, ave(y, x, FUN = function(x) replace(x, is.na(x), x[!is.na(x)][1])))
data
df <- structure(list(x = c("A", "B", "L", "F", "L", "A", "B", "B",
"A", "F", "G", "H"), y = c(NA, "d1", "d2", "c1", "s2", "c4",
NA, NA, "c1", "a5", NA, NA)), .Names = c("x", "y"), class = "data.frame",
row.names = c(NA, -12L))

Reshaping datas in a specific form

I've datas as follows, it is a but in reality i've few experiment, it is simplified dataset:
DF=structure(list(theoric = c("E", "E", "F", "F", "F"), observed = c("E",
"E", "F", "F", "E"), experiment = c("RO(2)", "RO(2)", "RO(2)", "RO(2)",
"RO(2)")), .Names = c("theoric", "observed", "experiment"), row.names = 2:6, class = "data.frame")
Now my datas has the following form:
theoric observed experiment
2 E E RO(2)
3 E E RO(2)
4 F F RO(2)
5 F F RO(2)
6 F E RO(2)
Adn I want it to be reshaped as follows :
2 3 4 5 6
RO(2) theoric E E F F F
RO(2) observed E E F F E
What is the easiest way to do it ? I really have no idea how to do this. I tried
meltR <- melt(DF, id="experiment")
But i'm lost all correspondance between theoric and observed. Thanks a lot
EDIT : full dataset:
DF=structure(list(theoric = c("E", "E", "F", "F", "F", "E", "F",
"F", "F", "F", "F", "E", "E", "E", "E"), observed = c("E", "E",
"F", "F", "E", "F", "F", "F", "F", "F", "F", "E", "E", "E", "F"
), experiment = c("RO", "RO", "RO", "RO", "RO", "MO", "MO", "MO",
"MO", "MO", "MO", "EL", "EL", "EL", "EL")), .Names = c("theoric",
"observed", "experiment"), row.names = c(2L, 3L, 4L, 5L, 6L,
24L, 25L, 26L, 27L, 28L, 29L, 21L, 22L, 23L, 13L), class = "data.frame")
output:
col2 col1.2 col1.3 col1.4 col1.5 col1.6 col1.24 col1.25 col1.26
1 RO theoric E E F F F <NA> <NA> <NA>
6 MO theoric <NA> <NA> <NA> <NA> <NA> E F F
12 EL theoric <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
16 RO observed E E F F E <NA> <NA> <NA>
21 MO observed <NA> <NA> <NA> <NA> <NA> F F F
27 EL observed <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
col1.27 col1.28 col1.29 col1.21 col1.22 col1.23 col1.13
1 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
6 F F F <NA> <NA> <NA> <NA>
12 <NA> <NA> <NA> E E E E
16 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
21 F F F <NA> <NA> <NA> <NA>
27 <NA> <NA> <NA> E E E F
EDIT 2 : Add EL ouput
RO theoric E E F F F
RO observed E E F F E
MO theoric E F F F F
MO observed F F F F F
EL theoric E E E E
EL observed E E E F
Based on the expected output, we may need to create a column with row.names. Create a new dataset ('df2'), by unlisting the first two columns, replicating the 'experiment' column, and a rownames column. Then use reshape from base R to convert the 'long' format to 'wide'.
df2 <- data.frame(col1 = unlist(DF[1:2], use.names=FALSE),
col2 = paste( rep(DF$experiment, 2),
rep(colnames(DF)[1:2], each = nrow(DF))), col3 = rep(row.names(DF), 2))
reshape(df2, idvar = "col2", direction="wide", timevar = "col3")
# col2 col1.2 col1.3 col1.4 col1.5 col1.6
#1 RO(2) theoric E E F F F
#6 RO(2) observed E E F F E
Or using melt/dcast from data.table. Convert the 'data.frame' to 'data.table' keeping the row names (setDT(DF, keep.row.names = TRUE)), melt it to 'long' format, paste the 'experiment' and 'variable' column, and then dcast from 'long' to 'wide' format.
library(data.table)
dcast(melt(setDT(DF, keep.rownames = TRUE), id.var = c("rn", "experiment"))[,
experiment := paste(experiment, variable)], experiment~rn, value.var = "value")
# experiment 2 3 4 5 6
#1: RO(2) observed E E F F E
#2: RO(2) theoric E E F F F
Update
Using the new dataset,
library(data.table)#v1.9.7+
dcast(melt(setDT(DF), id.var = "experiment"), paste(experiment,
variable)~rowid(experiment, variable), value.var="value", fill="")
# experiment 1 2 3 4 5 6
#1: EL observed E E E F
#2: EL theoric E E E E
#3: MO observed F F F F F F
#4: MO theoric E F F F F F
#5: RO observed E E F F E
#6: RO theoric E E F F F
You could also do the following:
require(tidyverse)
DF %>%
gather(type, val, theoric, observed) %>%
unite(experiment, experiment, type, sep=" ") %>%
group_by(experiment) %>%
mutate(experiment_number = 1:n()) %>%
spread(experiment_number, val, fill="")
Which gives you:
experiment `1` `2` `3` `4` `5` `6`
* <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 EL observed E E E F
2 EL theoric E E E E
3 MO observed F F F F F F
4 MO theoric E F F F F F
5 RO observed E E F F E
6 RO theoric E E F F F

Resources