Combining multiple column/ stacking multiple columns - r

So I have 9 column
a b c d e f g h i
1 1 t p 1 h p 1 v g
2 2 e h 2 j m 2 c f
3 3 f g 3 k l 3 b d
and i want to know how can I make them like this
a b c
1 1 t p
2 2 e h
3 3 f g
4 1 h p
5 2 j m
6 3 k l
7 1 v g
8 2 c f
9 3 b d

We can use reshape from base R by specifying the columns to combine together in a list of vectors
out <- reshape(df1, direction = 'long',
varying = list(c('a', 'd', 'g'), c('b', 'e', 'h'),
c('c', 'f', 'i')))[c('a', 'b', 'c')]
row.names(out) <- NULL
out
# a b c
#1 1 t p
#2 2 e h
#3 3 f g
#4 1 h p
#5 2 j m
#6 3 k l
#7 1 v g
#8 2 c f
#9 3 b d
Or using melt from data.table
library(data.table)
melt(setDT(df1), measure = list(c('a', 'd', 'g'), c('b', 'e', 'h'),
c('c', 'f', 'i')), value.name = c('a', 'b', 'c'))[, variable := NULL][]
data
df1 <- structure(list(a = 1:3, b = c("t", "e", "f"), c = c("p", "h",
"g"), d = 1:3, e = c("h", "j", "k"), f = c("p", "m", "l"), g = 1:3,
h = c("v", "c", "b"), i = c("g", "f", "d")),
class = "data.frame", row.names = c("1",
"2", "3"))

One option involving purrr could be:
map_dfc(.x = split.default(df, rep(1:3, length.out = length(df))),
~ stack(.)[1]) %>%
setNames(c("a", "b", "c"))
a b c
1 1 t p
2 2 e h
3 3 f g
4 1 h p
5 2 j m
6 3 k l
7 1 v g
8 2 c f
9 3 b d

Related

Replace certain columns in a data frame with the columns of another data frame

I have two data frames with the same columns names and the same size. Each of them has 40 columns and 5000 rows. I would like to replace certain columns in a data frame with those from the other df arranged by their common ID. The column ID is identical for both dfs but not necessarily in the same order for each df.
Let me provide an example for clarity.
df1 <- data.frame( ID = c("ID1", "ID2","ID3", "ID4","ID5", "ID6","ID7", "ID8", "ID9"),
A = c(1,2,3,4,5,6,7,8,9),
B = c(11,21,31,41,51,61,71,81,91),
C = c("a", "b", "c", "d", "e", "f", "g", "h", "i"),
D = c("a1","b1","c1", "d1","e1", "f1", "g1", "h1", "i1")
)
df1
df2 <- data.frame( ID = c("ID2", "ID1","ID3", "ID4","ID5", "ID6","ID9", "ID8", "ID7"),
A = sample(x = 1:20, size = 9),
B = sample(x = 1:50, size = 9),
C = c("A", "B", "C", "D", "E", "F", "G", "H", "I"),
D = c("A1","B1","C1", "D1","E1", "F1", "G1", "H1", "I1")
)
df2
This should be the df2 after replacing its columns, A, B with those from df1 while keeping the rest of the columns (C, D) unchanged.
df2_out <- data.frame( ID = c("ID2", "ID1","ID3", "ID4","ID5", "ID6","ID9", "ID8", "ID7"),
A = c(2,1,3,4,5,6,9,8,7),
B = c(21,11,31,41,51,61,91,81,71),
C = c("A", "B", "C", "D", "E", "F", "G", "H", "I"),
D = c("A1","B1","C1", "D1","E1", "F1", "G1", "H1", "I1")
)
As mentioned the number of the columns to be changed is long (30) in my data set:
changed_columns <- c("A", "B", ....)
any help on how to make it ?
Thank you
Using the data.table package, you can solve your problem as follows:
library(data.table)
setDT(df2)[df1, c("A", "B") := .(i.A, i.B), on = "ID"]
# ID A B C D
# 1: ID2 2 21 A A1
# 2: ID1 1 11 B B1
# 3: ID3 3 31 C C1
# 4: ID4 4 41 D D1
# 5: ID5 5 51 E E1
# 6: ID6 6 61 F F1
# 7: ID9 9 91 G G1
# 8: ID8 8 81 H H1
# 9: ID7 7 71 I I1
Another base R option by using merge + subset
df2_out <- subset(merge(df1[c("ID","A","B")],df2,all = TRUE,by = "ID"),select = -cbind(A.y,B.y))
such that
> df2_out
ID A.x B.x C D
1 ID1 1 11 B B1
2 ID2 2 21 A A1
3 ID3 3 31 C C1
4 ID4 4 41 D D1
5 ID5 5 51 E E1
6 ID6 6 61 F F1
7 ID7 7 71 I I1
8 ID8 8 81 H H1
9 ID9 9 91 G G1
We can use match to get the order of ID and replace them with changed_columns in df1.
changed_columns <- c("A", "B")
df2[match(df1$ID, df2$ID), changed_columns] <- df1[changed_columns]
df2
# ID A B C D
#1 ID2 2 21 A A1
#2 ID1 1 11 B B1
#3 ID3 3 31 C C1
#4 ID4 4 41 D D1
#5 ID5 5 51 E E1
#6 ID6 6 61 F F1
#7 ID9 9 91 G G1
#8 ID8 8 81 H H1
#9 ID7 7 71 I I1

Conditional column calculation with different value for every second rows in r

Given column 1 that has A,B, and C values, how to create column 2 under this condition:
- If column 1 is either A or B, column 2 would be F,F1,F,F1 (every second cell be F1), otherwise, same C.
We can use transform with ifelse
transform(df, Col2 = ifelse(Col1 %in% c("A", "B"), c("F", "F1"), Col1))
# Col1 Col2
#1 A F
#2 A F1
#3 A F
#4 A F1
#5 A F
#6 A F1
#7 B F
#8 B F1
#9 B F
#10 B F1
#11 B F
#12 C C
#13 C C
#14 C C
#15 C C
Probably, using it by group is more appropriate.
library(dplyr)
df %>%
group_by(Col1) %>%
mutate(Col2 = ifelse(Col1 %in% c("A", "B"), c("F", "F1"), Col1))
data
df <- data.frame(Col1 = rep(c("A", "B", "C"), 6:4), stringsAsFactors = FALSE)

Transform df into edges df for collaboration network

I have this df, which contains information on collaboration of articles:
author author2 author3 author4
1 A D E F
2 B G
3 C H F
I need to create an edges dataframe, which contains the relationship between the authors, like this:
from to
1 A D
2 A E
3 A F
4 B G
5 C H
6 C F
7 D E
8 D F
9 E F
11 H F
any ideas how to do it?
We can gather each column against the remaining columns i.e. to the left of that column and then binds all.
library(tidyverse)
map_dfr(names(df)[-length(df)], ~select(df,.x:ncol(df)) %>% gather( k,to,-.x) %>%
arrange(!!ensym(.x)) %>% select(-k) %>% filter(to!='') %>%
rename(form=starts_with('author')))
form to
1 A D
2 A E
3 A F
4 B G
5 C H
6 C F
7 D E
8 D F
9 H F
10 E F
Data
df <- structure(list(author = c("A", "B", "C"), author2 = c("D", "G",
"H"), author3 = c("E", "", "F"), author4 = c("F","", "")), class = "data.frame", row.names = c("1",
"2", "3"))
You could apply combn row-wise inside a function, no need for packages.
edges <- setNames(as.data.frame(do.call(rbind, lapply(seq(nrow(d)), function(x)
matrix(unlist(t(combn(na.omit(unlist(d[x, ])), 2))), ncol=2)))), c("from", "to"))
edges
# from to
# 1 A D
# 2 A E
# 3 A F
# 4 D E
# 5 D F
# 6 E F
# 7 B G
# 8 C H
# 9 C F
# 10 H F
Or, using igraph package as #akrun suggested.
library(igraph)
edges <- do.call(rbind, apply(d, 1, function(x)
as_data_frame(graph_from_data_frame(t(combn(na.omit(x), 2))))))
edges
# from to
# 1 A D
# 2 A E
# 3 A F
# 4 D E
# 5 D F
# 6 E F
# 7 B G
# 8 C H
# 9 C F
# 10 H F
Data
d <- structure(list(author = c("A", "B", "C"), author2 = c("D", "G",
"H"), author3 = c("E", NA, "F"), author4 = c("F", NA, NA)), row.names = c(NA,
-3L), class = "data.frame")

joining the first n factors (with different n) in R

A data frame contains ID, group, n (numeric), and several factor variables
ID <- c(1,2,3,4,5,6,7,8,9,10)
group <- c("m", "m", "m", "f", "f", "m", "m", "f", "f", "m")
n <- c(1,2,6,3,6,8,4,1,4,2)
b1 <- c("a", "b", "", "a", "d", "d", "a", "c", "c", "b")
b2 <- c("a", "", "e", "a", "d", "d", "a", "c", "c", "b")
b3 <- c("a", "b", "", "a", "", "d", "a", "c", "c", "b")
b4 <- c("a", "b", "e", "a", "", "d", "a", "c", "c", "b")
b5 <- c("a", "b", "e", "a", "d", "", "", "", "c", "b")
b6 <- c("a", "", "", "", "d", "d", "", "c", "c", "b")
df <- data.frame(ID, group, n, b1, b2, b3, b4, b5, b6)
I need to create a new character column (call it y).
They way to compute y is by joining the first n variables (b1,b2,b3,b4,b5,b6) and use comma to seperate them.
Note, in case a column is a blank, then remove it from the join.
For example, for ID=1, y = "a"; for ID = 2, y = "b" (not "b, "); for ID = 3, y = "e,e,e", etc.
And, the faster the code, the better.
A possible sollution, the speed might still be an issue:
df$y <- sapply(seq_len(nrow(df)), function(i){
cvec <- head(unlist(df[i, 4:9]), df$n[i])
cvec <- cvec[!cvec == '']
paste(cvec, collapse = ',')
})
# ID group n b1 b2 b3 b4 b5 b6 y
# 1 1 m 1 a a a a a a a
# 2 2 m 2 b b b b b
# 3 3 m 6 e e e e,e,e
# 4 4 f 3 a a a a a a,a,a
# 5 5 f 6 d d d d d,d,d,d
# 6 6 m 8 d d d d d d,d,d,d,d
# 7 7 m 4 a a a a a,a,a,a
# 8 8 f 1 c c c c c c
# 9 9 f 4 c c c c c c c,c,c,c
# 10 10 m 2 b b b b b b b,b
Here is an option using gsub and paste. We paste the 'b' columns of 'df' (do.call(paste0, df[-(1:3)]), then use substring to keep only the characters that suggested by 'n' column, use gsub to create the , in between each character.
df$y <- gsub("(?<=\\S)(?=\\S)", ",",
substring(do.call(paste0, df[-(1:3)]), 1, df$n), perl = TRUE)
df
# ID group n b1 b2 b3 b4 b5 b6 y
#1 1 m 1 a a a a a a a
#2 2 m 2 b b b b b,b
#3 3 m 6 e e e e,e,e
#4 4 f 3 a a a a a a,a,a
#5 5 f 6 d d d d d,d,d,d
#6 6 m 8 d d d d d d,d,d,d,d
#7 7 m 4 a a a a a,a,a,a
#8 8 f 1 c c c c c c
#9 9 f 4 c c c c c c c,c,c,c
#10 10 m 2 b b b b b b b,b
df$y <- apply(df, 1, function(r) {
gsub("\\s+", "\\,", trimws(paste(head(r[4:9], r["n"]), sep= " ", collapse = " ")))})
df
# ID group n b1 b2 b3 b4 b5 b6 y
# 1 1 m 1 a a a a a a a
# 2 2 m 2 b b b b b
# 3 3 m 6 e e e e,e,e
# 4 4 f 3 a a a a a a,a,a
# 5 5 f 6 d d d d d,d,d,d
# 6 6 m 8 d d d d d d,d,d,d,d
# 7 7 m 4 a a a a a,a,a,a
# 8 8 f 1 c c c c c c
# 9 9 f 4 c c c c c c c,c,c,c
# 10 10 m 2 b b b b b b b,b

Frequency count for a specific category

I have a data set like this.
a <- structure(list(Prone = c("M", "N", "N", "N", "M", "N", "M", "N", "M", "M"),
Type = c("A", "B", "C", "A", "A", "A", "B", "B", "C", "B"),
Alc = c("A", "B", "N", "A", "A", "A", "B", "B", "B", "B"),
Com = c("Y", "N", "Y", "Y", "Y", "Y", "Y", "N", "N", "Y")),
.Names = c("Prone", "Type", "Alc", "Com"), row.names = c(NA, -10L), class = "data.frame")
a
Prone Type Alc Com
1 M A A Y
2 N B B N
3 N C N Y
4 N A A Y
5 M A A Y
6 N A A Y
7 M B B Y
8 N B B N
9 M C B N
10 M B B Y
I like to get the frequency count of each unique row like the following:
Prone Type Alc Com Freq
1 M A A Y 2
2 M B B Y 2
3 M C B N 1
4 N A A Y 2
5 N B B N 2
6 N C N Y 1
Thanks in advance.
Alternate plyr solution:
> library("plyr")
> count(a)
Prone Type Alc Com freq
1 M A A Y 2
2 M B B Y 2
3 M C B N 1
4 N A A Y 2
5 N B B N 2
6 N C N Y 1
The mandatory data.table solution:
library(data.table)
dt = data.table(a)
dt[, list(Freq = .N), by = names(dt)]
There are a lot of ways to do this, here is a simple plyr example:
> library(plyr)
> ddply(a,names(a),summarize,Freq=length(Prone))
Prone Type Alc Com Freq
1 M A A Y 2
2 M B B Y 2
3 M C B N 1
4 N A A Y 2
5 N B B N 2
6 N C N Y 1
Using base aggregate:
aggregate(data = transform(a, Freq = seq_len(nrow(a))), Freq ~ ., length)
Prone Type Alc Com Freq
1 N B B N 2
2 M C B N 1
3 M A A Y 2
4 N A A Y 2
5 M B B Y 2
6 N C N Y 1
Here's another approach:
library(qdap)
colsplit2df(data.frame(table(paste2(a))), new.names = names(a))
## > colsplit2df(data.frame(table(paste2(a))), new.names = names(a))
## Prone Type Alc Com Freq
## 1 M A A Y 2
## 2 M B B Y 2
## 3 M C B N 1
## 4 N A A Y 2
## 5 N B B N 2
## 6 N C N Y 1

Resources