Reshaping datas in a specific form - r

My data look as follows. This is a simplified dataset; in reality I have several experiments:
DF=structure(list(theoric = c("E", "E", "F", "F", "F"), observed = c("E",
"E", "F", "F", "E"), experiment = c("RO(2)", "RO(2)", "RO(2)", "RO(2)",
"RO(2)")), .Names = c("theoric", "observed", "experiment"), row.names = 2:6, class = "data.frame")
Now my data have the following form:
theoric observed experiment
2 E E RO(2)
3 E E RO(2)
4 F F RO(2)
5 F F RO(2)
6 F E RO(2)
And I want it to be reshaped as follows:
2 3 4 5 6
RO(2) theoric E E F F F
RO(2) observed E E F F E
What is the easiest way to do it ? I really have no idea how to do this. I tried
meltR <- melt(DF, id="experiment")
But I lose all correspondence between theoric and observed. Thanks a lot
EDIT : full dataset:
DF=structure(list(theoric = c("E", "E", "F", "F", "F", "E", "F",
"F", "F", "F", "F", "E", "E", "E", "E"), observed = c("E", "E",
"F", "F", "E", "F", "F", "F", "F", "F", "F", "E", "E", "E", "F"
), experiment = c("RO", "RO", "RO", "RO", "RO", "MO", "MO", "MO",
"MO", "MO", "MO", "EL", "EL", "EL", "EL")), .Names = c("theoric",
"observed", "experiment"), row.names = c(2L, 3L, 4L, 5L, 6L,
24L, 25L, 26L, 27L, 28L, 29L, 21L, 22L, 23L, 13L), class = "data.frame")
output:
col2 col1.2 col1.3 col1.4 col1.5 col1.6 col1.24 col1.25 col1.26
1 RO theoric E E F F F <NA> <NA> <NA>
6 MO theoric <NA> <NA> <NA> <NA> <NA> E F F
12 EL theoric <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
16 RO observed E E F F E <NA> <NA> <NA>
21 MO observed <NA> <NA> <NA> <NA> <NA> F F F
27 EL observed <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
col1.27 col1.28 col1.29 col1.21 col1.22 col1.23 col1.13
1 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
6 F F F <NA> <NA> <NA> <NA>
12 <NA> <NA> <NA> E E E E
16 <NA> <NA> <NA> <NA> <NA> <NA> <NA>
21 F F F <NA> <NA> <NA> <NA>
27 <NA> <NA> <NA> E E E F
EDIT 2: Add EL output
RO theoric E E F F F
RO observed E E F F E
MO theoric E F F F F
MO observed F F F F F
EL theoric E E E E
EL observed E E E F

Based on the expected output, we may need to create a column with row.names. Create a new dataset ('df2'), by unlisting the first two columns, replicating the 'experiment' column, and a rownames column. Then use reshape from base R to convert the 'long' format to 'wide'.
# Build a long-format helper table with one row per value:
#   col1 - the stacked values of the first two columns of DF
#   col2 - a label made of the experiment and the source column name
#   col3 - the original row name of DF, repeated once per stacked column
df2 <- data.frame(col1 = unlist(DF[1:2], use.names=FALSE),
col2 = paste( rep(DF$experiment, 2),
rep(colnames(DF)[1:2], each = nrow(DF))), col3 = rep(row.names(DF), 2))
# Go from long to wide: one row per col2 label, one col1.<row name> column per col3 value
reshape(df2, idvar = "col2", direction="wide", timevar = "col3")
# col2 col1.2 col1.3 col1.4 col1.5 col1.6
#1 RO(2) theoric E E F F F
#6 RO(2) observed E E F F E
Or using melt/dcast from data.table. Convert the 'data.frame' to 'data.table' keeping the row names (setDT(DF, keep.rownames = TRUE)), melt it to 'long' format, paste the 'experiment' and 'variable' columns, and then dcast from 'long' to 'wide' format.
library(data.table)
# Keep the row names as column 'rn', melt to long format with 'rn' and
# 'experiment' as id columns, append the melted 'variable' name to 'experiment',
# then cast back to wide with one column per 'rn' value.
dcast(melt(setDT(DF, keep.rownames = TRUE), id.var = c("rn", "experiment"))[,
experiment := paste(experiment, variable)], experiment~rn, value.var = "value")
# experiment 2 3 4 5 6
#1: RO(2) observed E E F F E
#2: RO(2) theoric E E F F F
Update
Using the new dataset,
library(data.table)#v1.9.7+
dcast(melt(setDT(DF), id.var = "experiment"), paste(experiment,
variable)~rowid(experiment, variable), value.var="value", fill="")
# experiment 1 2 3 4 5 6
#1: EL observed E E E F
#2: EL theoric E E E E
#3: MO observed F F F F F F
#4: MO theoric E F F F F F
#5: RO observed E E F F E
#6: RO theoric E E F F F

You could also do the following:
# library() stops with an error if the package is missing; require() would only
# return FALSE and let the pipeline fail later with a less helpful message.
library(tidyverse)
DF %>%
  # stack 'theoric' and 'observed' into a key column (type) and a value column (val)
  gather(type, val, theoric, observed) %>%
  # merge experiment and type into a single label such as "RO theoric"
  unite(experiment, experiment, type, sep=" ") %>%
  group_by(experiment) %>%
  # number the observations within each experiment/type label
  mutate(experiment_number = row_number()) %>%
  # one column per observation number; shorter groups are padded with ""
  spread(experiment_number, val, fill="")
Which gives you:
experiment `1` `2` `3` `4` `5` `6`
* <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 EL observed E E E F
2 EL theoric E E E E
3 MO observed F F F F F F
4 MO theoric E F F F F F
5 RO observed E E F F E
6 RO theoric E E F F F

Related

Rolling paste strings across columns

I have this type of data:
df <- data.frame(
w1 = c("A", "B", "C", "E", "F", "G"),
w2 = c("B", "G", "C", "D", "E", "V"),
w3 = c("D", "S", "O", "F", NA, "N"),
w4 = c("E", "U", NA, "T", NA, NA),
w5 = c("C", NA, NA, NA, NA, NA)
)
I need to iterate through column pairs to rolling-paste the separate strings into bigrams. Note that in the actual data the strings are of variable character length and character type.
I've tried this but it fails:
df[, paste0("bigr_", 1:4, "_", 2:5)] <- lapply(df[, 1:5],
function(x) paste(x[i], x[i+1], sep = " "))
The expected output is:
w1 w2 w3 w4 w5 bigr_1_2 bigr_2_3 bigr_3_4 bigr_4_5
1 A B D E C A B B D D E E C
2 B G S U <NA> B G G S S U <NA>
3 C C O <NA> <NA> C C C O <NA> <NA>
4 E D F T <NA> E D D F F T <NA>
5 F E <NA> <NA> <NA> F E <NA> <NA> <NA>
6 G V N <NA> <NA> G V V N <NA> <NA>
I'd be most interested in a dplyr solution but am open and grateful for other solutions as well.
As you said you're most interested in a dplyr solution, this can be achieved using mutate() and across(). You can alter the function applied to each column if this doesn't achieve the exact desired output.
df %>%
mutate(
across(
# For the first four columns (i.e. has number 1-4 in column name)
matches("[1-4]"),
# Apply custom function
function(col) {
# Paste together
paste(
col, # the data in the current column
cur_data()[[which(names(cur_data()) == cur_column())+1]], # and the data in the next column along
sep = " "
)
},
.names = "{gsub(pattern = 'w', replacement = 'bigr_', {col})}" # alter name of new cols (replace 'w' with 'bigr_')
)
) %>%
# EDIT: added to rename columns to match desired output
rename_with(.cols = matches("bigr"),
.fn = function(colname) {
paste0(colname, "_", as.numeric(gsub(pattern = "bigr_", replacement = "", colname))+1)
})
df <- data.frame(
w1 = c("A", "B", "C", "E", "F", "G"),
w2 = c("B", "G", "C", "D", "E", "V"),
w3 = c("D", "S", "O", "F", NA, "N"),
w4 = c("E", "U", NA, "T", NA, NA),
w5 = c("C", NA, NA, NA, NA, NA)
)
library(tidyverse)
library(janitor)
df %>%
mutate(rn = row_number()) %>%
pivot_longer(-rn, values_drop_na = TRUE) %>%
group_by(rn) %>%
mutate(bigr = paste0(value, "_", lead(value))) %>%
mutate(bigr = if_else(str_detect(bigr, "_NA"), NA_character_, bigr)) %>%
pivot_wider(rn, names_from = c(name), values_from = c(value, bigr)) %>%
remove_empty("cols") %>%
ungroup() %>%
select(-rn) %>%
rename_with(~str_remove(string = ., "value_")) %>%
rename_with(~str_replace(., "(_w)(\\d+)", "_\\2"))
#> # A tibble: 6 × 9
#> w1 w2 w3 w4 w5 bigr_1 bigr_2 bigr_3 bigr_4
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 A B D E C A_B B_D D_E E_C
#> 2 B G S U <NA> B_G G_S S_U <NA>
#> 3 C C O <NA> <NA> C_C C_O <NA> <NA>
#> 4 E D F T <NA> E_D D_F F_T <NA>
#> 5 F E <NA> <NA> <NA> F_E <NA> <NA> <NA>
#> 6 G V N <NA> <NA> G_V V_N <NA> <NA>
Created on 2022-04-26 by the reprex package (v2.0.1)
As long as you don't have a string that is NA, you could try:
df %>%
mutate(across(-1,
~ paste(get(paste0("w", match(cur_column(), names(cur_data())) - 1)), .),
.names = 'bigr_{paste0("w", match(.col, names(cur_data())) - 1)}_{.col}')) %>%
mutate(across(starts_with("bigr"),
~ if_else(str_count(., "NA") != 0, NA_character_, .)))
w1 w2 w3 w4 w5 bigr_w1_w2 bigr_w2_w3 bigr_w3_w4 bigr_w4_w5
1 A B D E C A B B D D E E C
2 B G S U <NA> B G G S S U <NA>
3 C C O <NA> <NA> C C C O <NA> <NA>
4 E D F T <NA> E D D F F T <NA>
5 F E <NA> <NA> <NA> F E <NA> <NA> <NA>
6 G V N <NA> <NA> G V V N <NA> <NA>
As you are open to non-dplyr solutions, we can do it in base R by modifying your original code:
df[, paste0("bigr_", 1:4, "_", 2:5)] <- mapply(paste, df[, 1:4], df[, 2:5])
# as NA is coerced to character, we need to find those positions and correct
x <- which(is.na(df[, 1:4]) | is.na(df[, 2:5]), arr.ind = TRUE)
x[, 2] <- x[, 2] + 5
df[x] <- NA
df
# w1 w2 w3 w4 w5 bigr_1_2 bigr_2_3 bigr_3_4 bigr_4_5
# 1 A B D E C A B B D D E E C
# 2 B G S U <NA> B G G S S U <NA>
# 3 C C O <NA> <NA> C C C O <NA> <NA>
# 4 E D F T <NA> E D D F F T <NA>
# 5 F E <NA> <NA> <NA> F E <NA> <NA> <NA>
# 6 G V N <NA> <NA> G V V N <NA> <NA>
We can use the tidytext package as follows:
df %>%
rowid_to_column() %>%
unite(col, -rowid, sep = ' ') %>%
tidytext::unnest_ngrams(value, 'col', 2, to_lower = FALSE) %>%
group_by(rowid) %>%
mutate(name = row_number()) %>%
pivot_wider(rowid, names_prefix = 'bgram_')
# A tibble: 6 x 5
# Groups: rowid [6]
rowid bgram_1 bgram_2 bgram_3 bgram_4
<int> <chr> <chr> <chr> <chr>
1 1 A B B D D E E C
2 2 B G G S S U U NA
3 3 C C C O O NA NA NA
4 4 E D D F F T T NA
5 5 F E E NA NA NA NA NA
6 6 G V V N N NA NA NA
using data.table
library(data.table)
# `:=` is only defined for data.tables, so convert the data.frame first.
setDT(df)
# Paste each of the first four columns with its right-hand neighbour; if either
# input is NA the bigram is NA instead of a string containing "NA".
df[, (paste("bigr", 1:4, 2:5, sep = "_")) := Map(function(x, y) ifelse(is.na(x) | is.na(y), NA_character_, paste(x, y)), .SD[, 1:4], .SD[, 2:5])]

Replace certain columns in a data frame with the columns of another data frame

I have two data frames with the same columns names and the same size. Each of them has 40 columns and 5000 rows. I would like to replace certain columns in a data frame with those from the other df arranged by their common ID. The column ID is identical for both dfs but not necessarily in the same order for each df.
Let me provide an example for clarity.
df1 <- data.frame( ID = c("ID1", "ID2","ID3", "ID4","ID5", "ID6","ID7", "ID8", "ID9"),
A = c(1,2,3,4,5,6,7,8,9),
B = c(11,21,31,41,51,61,71,81,91),
C = c("a", "b", "c", "d", "e", "f", "g", "h", "i"),
D = c("a1","b1","c1", "d1","e1", "f1", "g1", "h1", "i1")
)
df1
df2 <- data.frame( ID = c("ID2", "ID1","ID3", "ID4","ID5", "ID6","ID9", "ID8", "ID7"),
A = sample(x = 1:20, size = 9),
B = sample(x = 1:50, size = 9),
C = c("A", "B", "C", "D", "E", "F", "G", "H", "I"),
D = c("A1","B1","C1", "D1","E1", "F1", "G1", "H1", "I1")
)
df2
This should be the df2 after replacing its columns, A, B with those from df1 while keeping the rest of the columns (C, D) unchanged.
df2_out <- data.frame( ID = c("ID2", "ID1","ID3", "ID4","ID5", "ID6","ID9", "ID8", "ID7"),
A = c(2,1,3,4,5,6,9,8,7),
B = c(21,11,31,41,51,61,91,81,71),
C = c("A", "B", "C", "D", "E", "F", "G", "H", "I"),
D = c("A1","B1","C1", "D1","E1", "F1", "G1", "H1", "I1")
)
As mentioned the number of the columns to be changed is long (30) in my data set:
changed_columns <- c("A", "B", ....)
any help on how to make it ?
Thank you
Using the data.table package, you can solve your problem as follows:
library(data.table)
# Columns of df2 to overwrite with the values from df1; extend this vector to
# cover every column that has to be replaced.
changed_columns <- c("A", "B")
# Update join on ID: for each matching ID, copy df1's values (addressed through
# the "i." prefix) into df2. df2 keeps its own row order and its other columns.
setDT(df2)[df1, (changed_columns) := mget(paste0("i.", changed_columns)), on = "ID"]
# ID A B C D
# 1: ID2 2 21 A A1
# 2: ID1 1 11 B B1
# 3: ID3 3 31 C C1
# 4: ID4 4 41 D D1
# 5: ID5 5 51 E E1
# 6: ID6 6 61 F F1
# 7: ID9 9 91 G G1
# 8: ID8 8 81 H H1
# 9: ID7 7 71 I I1
Another base R option by using merge + subset
df2_out <- subset(merge(df1[c("ID","A","B")],df2,all = TRUE,by = "ID"),select = -cbind(A.y,B.y))
such that
> df2_out
ID A.x B.x C D
1 ID1 1 11 B B1
2 ID2 2 21 A A1
3 ID3 3 31 C C1
4 ID4 4 41 D D1
5 ID5 5 51 E E1
6 ID6 6 61 F F1
7 ID7 7 71 I I1
8 ID8 8 81 H H1
9 ID9 9 91 G G1
We can use match to get the order of ID and replace them with changed_columns in df1.
changed_columns <- c("A", "B")
df2[match(df1$ID, df2$ID), changed_columns] <- df1[changed_columns]
df2
# ID A B C D
#1 ID2 2 21 A A1
#2 ID1 1 11 B B1
#3 ID3 3 31 C C1
#4 ID4 4 41 D D1
#5 ID5 5 51 E E1
#6 ID6 6 61 F F1
#7 ID9 9 91 G G1
#8 ID8 8 81 H H1
#9 ID7 7 71 I I1

Combining multiple column/ stacking multiple columns

So I have 9 column
a b c d e f g h i
1 1 t p 1 h p 1 v g
2 2 e h 2 j m 2 c f
3 3 f g 3 k l 3 b d
and i want to know how can I make them like this
a b c
1 1 t p
2 2 e h
3 3 f g
4 1 h p
5 2 j m
6 3 k l
7 1 v g
8 2 c f
9 3 b d
We can use reshape from base R by specifying the columns to combine together in a list of vectors
out <- reshape(df1, direction = 'long',
varying = list(c('a', 'd', 'g'), c('b', 'e', 'h'),
c('c', 'f', 'i')))[c('a', 'b', 'c')]
row.names(out) <- NULL
out
# a b c
#1 1 t p
#2 2 e h
#3 3 f g
#4 1 h p
#5 2 j m
#6 3 k l
#7 1 v g
#8 2 c f
#9 3 b d
Or using melt from data.table
library(data.table)
melt(setDT(df1), measure = list(c('a', 'd', 'g'), c('b', 'e', 'h'),
c('c', 'f', 'i')), value.name = c('a', 'b', 'c'))[, variable := NULL][]
data
df1 <- structure(list(a = 1:3, b = c("t", "e", "f"), c = c("p", "h",
"g"), d = 1:3, e = c("h", "j", "k"), f = c("p", "m", "l"), g = 1:3,
h = c("v", "c", "b"), i = c("g", "f", "d")),
class = "data.frame", row.names = c("1",
"2", "3"))
One option involving purrr could be:
# The columns of df1 are dealt round-robin into three groups (a/d/g, b/e/h,
# c/f/i). Each group is stacked into a single long column of values, and the
# three resulting columns are bound side by side and renamed.
map_dfc(.x = split.default(df1, rep(1:3, length.out = length(df1))),
~ stack(.)[1]) %>%
setNames(c("a", "b", "c"))
a b c
1 1 t p
2 2 e h
3 3 f g
4 1 h p
5 2 j m
6 3 k l
7 1 v g
8 2 c f
9 3 b d

Transform df into edges df for collaboration network

I have this df, which contains information on collaboration of articles:
author author2 author3 author4
1 A D E F
2 B G
3 C H F
I need to create an edges dataframe, which contains the relationship between the authors, like this:
from to
1 A D
2 A E
3 A F
4 B G
5 C H
6 C F
7 D E
8 D F
9 E F
11 H F
any ideas how to do it?
We can gather each column against the remaining columns, i.e. those to the right of that column, and then bind all the results together.
library(tidyverse)
# For every column except the last: keep that column and everything after it,
# gather the later columns into 'to', drop empty targets, and name the leading
# author column 'from' so the result is a from/to edge list.
map_dfr(names(df)[-length(df)], ~select(df,.x:ncol(df)) %>% gather( k,to,-.x) %>%
arrange(!!ensym(.x)) %>% select(-k) %>% filter(to!='') %>%
rename(from=starts_with('author')))
from to
1 A D
2 A E
3 A F
4 B G
5 C H
6 C F
7 D E
8 D F
9 H F
10 E F
Data
df <- structure(list(author = c("A", "B", "C"), author2 = c("D", "G",
"H"), author3 = c("E", "", "F"), author4 = c("F","", "")), class = "data.frame", row.names = c("1",
"2", "3"))
You could apply combn row-wise inside a function, no need for packages.
edges <- setNames(as.data.frame(do.call(rbind, lapply(seq(nrow(d)), function(x)
matrix(unlist(t(combn(na.omit(unlist(d[x, ])), 2))), ncol=2)))), c("from", "to"))
edges
# from to
# 1 A D
# 2 A E
# 3 A F
# 4 D E
# 5 D F
# 6 E F
# 7 B G
# 8 C H
# 9 C F
# 10 H F
Or, using the igraph package as @akrun suggested.
library(igraph)
edges <- do.call(rbind, apply(d, 1, function(x)
as_data_frame(graph_from_data_frame(t(combn(na.omit(x), 2))))))
edges
# from to
# 1 A D
# 2 A E
# 3 A F
# 4 D E
# 5 D F
# 6 E F
# 7 B G
# 8 C H
# 9 C F
# 10 H F
Data
d <- structure(list(author = c("A", "B", "C"), author2 = c("D", "G",
"H"), author3 = c("E", NA, "F"), author4 = c("F", NA, NA)), row.names = c(NA,
-3L), class = "data.frame")

joining the first n factors (with different n) in R

A data frame contains ID, group, n (numeric), and several factor variables
ID <- c(1,2,3,4,5,6,7,8,9,10)
group <- c("m", "m", "m", "f", "f", "m", "m", "f", "f", "m")
n <- c(1,2,6,3,6,8,4,1,4,2)
b1 <- c("a", "b", "", "a", "d", "d", "a", "c", "c", "b")
b2 <- c("a", "", "e", "a", "d", "d", "a", "c", "c", "b")
b3 <- c("a", "b", "", "a", "", "d", "a", "c", "c", "b")
b4 <- c("a", "b", "e", "a", "", "d", "a", "c", "c", "b")
b5 <- c("a", "b", "e", "a", "d", "", "", "", "c", "b")
b6 <- c("a", "", "", "", "d", "d", "", "c", "c", "b")
df <- data.frame(ID, group, n, b1, b2, b3, b4, b5, b6)
I need to create a new character column (call it y).
The way to compute y is by joining the first n variables (b1,b2,b3,b4,b5,b6), using a comma to separate them.
Note, in case a column is a blank, then remove it from the join.
For example, for ID=1, y = "a"; for ID = 2, y = "b" (not "b, "); for ID = 3, y = "e,e,e", etc.
And, the faster the code, the better.
A possible solution, though speed might still be an issue:
# For each row: take the first n of the b-columns (columns 4 to 9), drop the
# blank ones, and join what is left with commas. vapply() guarantees that the
# result is a character vector with exactly one string per row.
df$y <- vapply(seq_len(nrow(df)), function(i) {
  first_n <- head(unlist(df[i, 4:9]), df$n[i])
  first_n <- first_n[first_n != ""]
  paste(first_n, collapse = ",")
}, character(1))
# ID group n b1 b2 b3 b4 b5 b6 y
# 1 1 m 1 a a a a a a a
# 2 2 m 2 b b b b b
# 3 3 m 6 e e e e,e,e
# 4 4 f 3 a a a a a a,a,a
# 5 5 f 6 d d d d d,d,d,d
# 6 6 m 8 d d d d d d,d,d,d,d
# 7 7 m 4 a a a a a,a,a,a
# 8 8 f 1 c c c c c c
# 9 9 f 4 c c c c c c c,c,c,c
# 10 10 m 2 b b b b b b b,b
Here is an option using gsub and paste. We paste the 'b' columns of 'df' together (do.call(paste0, ...)), then use substring to keep only the number of characters given by the 'n' column, and use gsub to insert a "," between each pair of adjacent characters. Note that this relies on every non-blank value being a single character.
# Blank cells must still occupy one character in the pasted string, otherwise
# later columns slide into the first-n window (e.g. ID 2 would give "b,b"
# instead of "b"). Pad blanks with a space so position k always is column b<k>.
padded <- lapply(df[-(1:3)], function(x) {
  x <- as.character(x)
  x[x == ""] <- " "
  x
})
# Keep the first n positions, then remove the placeholder spaces.
first_n <- gsub(" ", "", substring(do.call(paste0, padded), 1, df$n), fixed = TRUE)
# Insert a comma between every pair of adjacent characters.
df$y <- gsub("(?<=\\S)(?=\\S)", ",", first_n, perl = TRUE)
df
# ID group n b1 b2 b3 b4 b5 b6 y
#1 1 m 1 a a a a a a a
#2 2 m 2 b b b b b
#3 3 m 6 e e e e,e,e
#4 4 f 3 a a a a a a,a,a
#5 5 f 6 d d d d d,d,d,d
#6 6 m 8 d d d d d d,d,d,d,d
#7 7 m 4 a a a a a,a,a,a
#8 8 f 1 c c c c c c
#9 9 f 4 c c c c c c c,c,c,c
#10 10 m 2 b b b b b b b,b
# apply() hands each row over as a character vector, so n arrives as a string.
# Convert it explicitly instead of relying on implicit coercion inside head().
# The first n b-values are joined with spaces, trimmed, and every run of
# whitespace (including the gaps left by blank cells) becomes a single comma.
df$y <- apply(df, 1, function(r) {
  gsub("\\s+", "\\,", trimws(paste(head(r[4:9], as.integer(r[["n"]])), sep= " ", collapse = " ")))})
df
# ID group n b1 b2 b3 b4 b5 b6 y
# 1 1 m 1 a a a a a a a
# 2 2 m 2 b b b b b
# 3 3 m 6 e e e e,e,e
# 4 4 f 3 a a a a a a,a,a
# 5 5 f 6 d d d d d,d,d,d
# 6 6 m 8 d d d d d d,d,d,d,d
# 7 7 m 4 a a a a a,a,a,a
# 8 8 f 1 c c c c c c
# 9 9 f 4 c c c c c c c,c,c,c
# 10 10 m 2 b b b b b b b,b

Resources