Expand data frame in R by duplicating and substituting variable using piping - r

I would like to expand a data frame in the following way:
GX will be substituted by three different variables Gs = (G4, G5, G6), which will all keep the attribute values of GX. For each new Gs, the current GX rows must be duplicated and the GX name substituted by the corresponding Gs name.
set.seed(123)
df = data.frame(
"id" = c(rep("G1", 3), rep("G2", 3), rep("G3", 3), rep("GX",3)),
"subgroup" = rep(c(1,2,3), 4),
"total" = sample.int(n = 12),
"C1" = sample.int(n=12),
"C2" = sample.int(n=12),
"C3" = sample.int(n=12))
id subgroup total C1 C2 C3
1 G1 1 3 11 9 9
2 G1 2 12 5 3 12
3 G1 3 10 3 4 10
4 G2 1 2 9 1 7
5 G2 2 6 4 11 3
6 G2 3 11 1 7 4
7 G3 1 5 7 5 5
8 G3 2 4 12 10 6
9 G3 3 9 10 8 8
10 GX 1 8 2 2 2
11 GX 2 1 6 12 1
12 GX 3 7 8 6 11
I have one solution that includes a for loop:
Gs = c("G4", "G5", "G6")
# Select the template rows once: the GX rows never change while looping.
gx_rows <- df[df$id == "GX", ]
# seq_along() is safe when Gs is empty (1:length(Gs) would give c(1, 0)).
for (ii in seq_along(Gs)) {
  tmp.df <- gx_rows
  # Relabel the copied GX rows with the current replacement name.
  tmp.df$id <- gsub(
    pattern = "GX",
    replacement = Gs[ii],
    x = tmp.df$id
  )
  # Append the relabelled copy to the growing data frame.
  df <- rbind(df, tmp.df)
}
df = df[df$id != "GX",]
Which gives:
id subgroup total C1 C2 C3
1 G1 1 3 11 9 9
2 G1 2 12 5 3 12
3 G1 3 10 3 4 10
4 G2 1 2 9 1 7
5 G2 2 6 4 11 3
6 G2 3 11 1 7 4
7 G3 1 5 7 5 5
8 G3 2 4 12 10 6
9 G3 3 9 10 8 8
101 G4 1 8 2 2 2
111 G4 2 1 6 12 1
121 G4 3 7 8 6 11
102 G5 1 8 2 2 2
112 G5 2 1 6 12 1
122 G5 3 7 8 6 11
103 G6 1 8 2 2 2
113 G6 2 1 6 12 1
123 G6 3 7 8 6 11
However I would like to include the solution in a pipe and avoid the for loop solution. Is there any more R stylish approach using tidyverse syntax to be included in a pipe?
Thanks

We can select only the rows where id == 'GX', create all of its combinations with Gs, and bind the original dataframe to the result, removing the 'GX' rows.
library(dplyr)
df %>%
filter(id == 'GX') %>%
tidyr::crossing(Gs) %>%
select(-id, id = Gs) %>%
bind_rows(df, .) %>%
filter(id != 'GX')
# id subgroup total C1 C2 C3
#1 G1 1 3 11 9 9
#2 G1 2 12 5 3 12
#3 G1 3 10 3 4 10
#4 G2 1 2 9 1 7
#5 G2 2 6 4 11 3
#6 G2 3 11 1 7 4
#7 G3 1 5 7 5 5
#8 G3 2 4 12 10 6
#9 G3 3 9 10 8 8
#10 G4 1 8 2 2 2
#11 G5 1 8 2 2 2
#12 G6 1 8 2 2 2
#13 G4 2 1 6 12 1
#14 G5 2 1 6 12 1
#15 G6 2 1 6 12 1
#16 G4 3 7 8 6 11
#17 G5 3 7 8 6 11
#18 G6 3 7 8 6 11

This question was previously answered by Ronak Shah here. Copying what he did there:
library(dplyr)
library(tidyr)
set.seed(123)
df = data.frame(
"id" = c(rep("G1", 3), rep("G2", 3), rep("G3", 3), rep("GX",3)),
"subgroup" = rep(c(1,2,3), 4),
"total" = sample.int(n = 12),
"C1" = sample.int(n=12),
"C2" = sample.int(n=12),
"C3" = sample.int(n=12))
df2 <- df %>%
filter(id == "GX") %>%
group_by(id) %>%
slice(rep(seq_len(n()), n()))
Gives you:
> df2
# A tibble: 9 x 6
# Groups: id [1]
id subgroup total C1 C2 C3
<fct> <dbl> <int> <int> <int> <int>
1 GX 1 8 2 2 2
2 GX 2 1 6 12 1
3 GX 3 7 8 6 11
4 GX 1 8 2 2 2
5 GX 2 1 6 12 1
6 GX 3 7 8 6 11
7 GX 1 8 2 2 2
8 GX 2 1 6 12 1
9 GX 3 7 8 6 11
Which is very close to what you are after.

Related

Iterate from one data frame to another, error

I want to fill the NA in df with the values in data frame dat and iterate over columns, but it doesn't work, why? Or is there a better solution?
id <- factor(rep(letters[1:2], each=5))
A <- c(1,2,NA,6,8,9,0,6,7,9)
B <- c(5,6,1,9,8,1,NA,9,7,4)
C <- c(2,3,5,NA,NA,2,7,6,4,6)
D <- c(6,5,8,3,2,9,NA,2,6,8)
df <- data.frame(id, A, B,C,D)
df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a NA 1 5 8
4 a 6 9 NA 3
5 a 8 8 NA 2
6 b 9 1 2 9
7 b 0 NA 7 NA
8 b 6 9 6 2
9 b 7 7 4 6
10 b 9 4 6 8
dat <- data.frame(col=c("A","B","C","D"), value=c(23,45,26,89))
dat
col value
1 A 23
2 B 45
3 C 26
4 D 89
# Replace the NA entries of column `i` of `df` with the `value` from `dat`
# whose `col` equals `i`.
# Note: the assignment inside the function modifies a local copy of `df`,
# and the function returns that whole data frame, not only column `i`.
test <- function(i){
df[,i][is.na(df[,i])] <- dat$value[dat$col==i]
return(df)
}
# Drops the first column (`id`), so `id` is no longer part of `df` afterwards.
df <-df[,-1]
for(i in colnames(df)){
# test(i) returns an entire data frame, which is assigned here into the
# single column df[[i]].
df[[i]] <- test(i)
}
df #DOESN'T WORK
Should look like:
df
id A B C D
1 a 1 5 2 6
2 a 2 6 3 5
3 a 23 1 5 8
4 a 6 9 26 3
5 a 8 8 26 2
6 b 9 1 2 9
7 b 0 45 7 89
8 b 6 9 6 2
9 b 7 7 4 6
10 b 9 4 6 8
The replace_na function from tidyr should do what you want.
library(tidyverse)
df %>%
replace_na(list(
"A" = 23,
"B" = 45,
"C" = 26,
"D" = 89
))

Convert dataframe from wide format to long format with key stored in row R

I'm using tidyverse but a base solution is welcome, too.
Is there a way to, without transposing, gather a dataframe but instead of the key being the column names, the key is stored in a row. For example, let's say I have a tibble called df.
df <- tibble(a = c(5,3,5,6,2,"G1"),
b = c(5,3,5,6,2,"G1"),
c = c(8,2,6,4,1,"G2"),
d = c(8,2,6,4,1,"G2"),
e = c(9,3,7,8,4,"G3"),
f = c(9,3,7,8,4,"G3"),
g = c(6,5,2,1,8,"G4"),
h = c(6,5,2,1,8,"G4"))
df
# A tibble: 6 x 8
a b c d e f g h
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 5 5 8 8 9 9 6 6
2 3 3 2 2 3 3 5 5
3 5 5 6 6 7 7 2 2
4 6 6 4 4 8 8 1 1
5 2 2 1 1 4 4 8 8
6 G1 G1 G2 G2 G3 G3 G4 G4
The groups to group by or gather on is in the bottom row. Is there a way to get df to have three columns only, such that the columns c, e, and g are gathered into column a, columns d, f, and h are gathered into column b and row 6 becomes column c? The result would look like:
tibble(a = c(5,3,5,6,2,8,2,6,4,1,9,3,7,8,4,6,5,2,1,8),
b = c(5,3,5,6,2,8,2,6,4,1,9,3,7,8,4,6,5,2,1,8),
c = c("G1","G1","G1","G1","G1","G2","G2","G2","G2","G2",
"G3","G3","G3","G3","G3","G4","G4","G4","G4","G4"))
# A tibble: 20 x 3
a b c
<dbl> <dbl> <chr>
1 5 5 G1
2 3 3 G1
3 5 5 G1
4 6 6 G1
5 2 2 G1
6 8 8 G2
7 2 2 G2
8 6 6 G2
9 4 4 G2
10 1 1 G2
11 9 9 G3
12 3 3 G3
13 7 7 G3
14 8 8 G3
15 4 4 G3
16 6 6 G4
17 5 5 G4
18 2 2 G4
19 1 1 G4
20 8 8 G4
I would like to avoid transposing because I need the row and column orders preserved until everything is properly labeled.
Here is one idea.
library(tidyverse)
df2 <- df %>%
t() %>%
as.data.frame(stringsAsFactors = FALSE) %>%
split(f = .$V6) %>%
map_dfr(~.x %>%
select(-V6) %>%
t() %>%
as.data.frame(stringsAsFactors = FALSE) %>%
setNames(c("a", "b")),
.id = "c") %>%
select(a, b, c) %>%
mutate_at(vars(-c), list(~as.numeric(.)))
df2
# a b c
# 1 5 5 G1
# 2 3 3 G1
# 3 5 5 G1
# 4 6 6 G1
# 5 2 2 G1
# 6 8 8 G2
# 7 2 2 G2
# 8 6 6 G2
# 9 4 4 G2
# 10 1 1 G2
# 11 9 9 G3
# 12 3 3 G3
# 13 7 7 G3
# 14 8 8 G3
# 15 4 4 G3
# 16 6 6 G4
# 17 5 5 G4
# 18 2 2 G4
# 19 1 1 G4
# 20 8 8 G4
Here is one implementation. We can split the tibble into a list of tibbles based on the last row, loop through the list with imap, rename the columns to the same column names ('a', 'b'), mutate to create the column 'c' with the list name, and bind the rows.
library(tidyverse)
df %>%
slice(-n()) %>%
split.default(df %>%
slice(n()) %>%
flatten_chr) %>%
imap_dfr(~ .x %>%
rename_all(~ c('a', 'b')) %>%
mutate(c = .y))
# A tibble: 20 x 3
# a b c
# <chr> <chr> <chr>
# 1 5 5 G1
# 2 3 3 G1
# 3 5 5 G1
# 4 6 6 G1
# 5 2 2 G1
# 6 8 8 G2
# 7 2 2 G2
# 8 6 6 G2
# 9 4 4 G2
#10 1 1 G2
#11 9 9 G3
#12 3 3 G3
#13 7 7 G3
#14 8 8 G3
#15 4 4 G3
#16 6 6 G4
#17 5 5 G4
#18 2 2 G4
#19 1 1 G4
#20 8 8 G4
Transposing probably doesn't hurt if you do it step by step. In this base R solution, row and column information is kept until the last line.
d <- data.frame(t(as.matrix(df)))
l <- lapply(split(d[-6], d$X6), t)
res <- do.call(rbind, Map(cbind, l, c=names(l)))
res <- setNames(data.frame(res, row.names=NULL), letters[1:3])
res
# a b c
# 1 5 5 G1
# 2 3 3 G1
# 3 5 5 G1
# 4 6 6 G1
# 5 2 2 G1
# 6 8 8 G2
# 7 2 2 G2
# 8 6 6 G2
# 9 4 4 G2
# 10 1 1 G2
# 11 9 9 G3
# 12 3 3 G3
# 13 7 7 G3
# 14 8 8 G3
# 15 4 4 G3
# 16 6 6 G4
# 17 5 5 G4
# 18 2 2 G4
# 19 1 1 G4
# 20 8 8 G4
One option with data.table
First, since we're not using the original names, replace them. Also remove the last row and convert everything to integer.
library(data.table)
setDT(df)
df <- df[-.N]
df[, names(df) := lapply(.SD, as.integer)]
setnames(df, rep_len(c('a', 'b'), ncol(df)))
# a b a b a b a b
# 1: 5 5 8 8 9 9 6 6
# 2: 3 3 2 2 3 3 5 5
# 3: 5 5 6 6 7 7 2 2
# 4: 6 6 4 4 8 8 1 1
# 5: 2 2 1 1 4 4 8 8
Now melt on the row number, add the G[1-4] column, and dcast melted df to wide form.
df[, rid := 1:.N]
df2 <- melt(df, 'rid')
df2[, c := paste0('G', rowid(rid, variable))]
dcast(df2, rid + c ~ variable)[order(c), -'rid']
# c a b
# 1: G1 5 5
# 2: G1 3 3
# 3: G1 5 5
# 4: G1 6 6
# 5: G1 2 2
# 6: G2 8 8
# 7: G2 2 2
# 8: G2 6 6
# 9: G2 4 4
# 10: G2 1 1
# 11: G3 9 9
# 12: G3 3 3
# 13: G3 7 7
# 14: G3 8 8
# 15: G3 4 4
# 16: G4 6 6
# 17: G4 5 5
# 18: G4 2 2
# 19: G4 1 1
# 20: G4 8 8

Keep rows with specific string and the following row

This is my data frame
df <- data.frame(
id = 1:14,
group_id = c(rep(1:2, each = 3), rep(3:4, each = 4)),
type = rep("A", 14), stringsAsFactors = FALSE)
df[c(2,4,8,12),"type"] <- "B"
id group_id type
1 1 1 A
2 2 1 B
3 3 1 A
4 4 2 B
5 5 2 A
6 6 2 A
7 7 3 A
8 8 3 B
9 9 3 A
10 10 3 A
11 11 4 A
12 12 4 B
13 13 4 A
14 14 4 A
I'd like to keep all rows with type B as well as the following row.
I could do...
B <- which(df$type=="B")
afterB <- B+1
df_sel <- df[c(B, afterB), ]
df_sel <- df_sel[order(df_sel$id),]
df_sel
...to get what I want.
id group_id type
2 2 1 B
3 3 1 A
4 4 2 B
5 5 2 A
8 8 3 B
9 9 3 A
12 12 4 B
13 13 4 A
How can this be done in a more generic way?
Another way, very similar to what you do but in one step and without the need to reorder:
# Repeat each "B" row index twice and add c(0, 1) (recycled) so every index is
# followed by its successor. `each` is spelled out instead of relying on
# partial argument matching (`e = 2`).
df_sel <- df[rep(which(df$type == "B"), each = 2) + c(0, 1), ]
df_sel
# id group_id type
# 2 2 1 B
# 3 3 1 A
# 4 4 2 B
# 5 5 2 A
# 8 8 3 B
# 9 9 3 A
# 12 12 4 B
# 13 13 4 A
Using lag from dplyr
library(dplyr)
df[df$type == "B" | lag(df$type == "B", default = FALSE), ]
# id group_id type
#2 2 1 B
#3 3 1 A
#4 4 2 B
#5 5 2 A
#8 8 3 B
#9 9 3 A
#12 12 4 B
#13 13 4 A
using grep will provide a row index of all instances of B - rows; concatenate (c()) this with rows + 1 to select from df will work.
rows <- grep("B", df[, "type"])
df[sort(c(rows, rows + 1)), ]
gives:
id group_id type
2 2 1 B
3 3 1 A
4 4 2 B
5 5 2 A
8 8 3 B
9 9 3 A
12 12 4 B
13 13 4 A

How to mutate multiple variables without repeating codes?

I'm trying to create new variables from existing variables like below:
a1+a2=a3, b1+b2=b3, ..., z1+z2=z3
Here is an example data frame
df <- data.frame(replicate(10,sample(1:10)))
colnames(df) <- c("a1","a2","b1","b2","c1","c2","d1","d2","e1","e2")
Here's my solution with repeating codes
# a solution by base R
df$a3 <- df$a1 + df$a2
df$b3 <- df$b1 + df$b2
df$c3 <- df$c1 + df$c2
df$d3 <- df$d1 + df$d2
df$e3 <- df$e1 + df$e2
Or
# a solution by dplyr
library(dplyr)
# Add one "<letter>3" column per letter as the sum of its "1" and "2" columns.
df <- df %>%
  mutate(a3 = a1 + a2,
         b3 = b1 + b2,
         c3 = c1 + c2,
         d3 = d1 + d2,
         e3 = e1 + e2)  # was e1 + d2, which contradicts the e1+e2=e3 rule
Or
# a solution by data.table
library(data.table)
DT <- data.table(df)
DT[,a3:=a1+a2][,b3:=b1+b2][,c3:=c1+c2][,d3:=d1+d2][,e3:=e1+e2]
Actually I have more than 100 variables, so I want to find a way to do so without repeating code... Although I tried to use mutate_ with standard evaluation and regular expression, I lost my way because I'm a newbie in R. Can you mutate multiple variables without repeating code?
Your data format is making this hard - I would reshape the data like this. In general, you shouldn't encode actual data information in column names, if the difference between a1 and a2 is meaningful, it is better to have a column with letter, a, b, c and a column with number, 1, 2.
df$id = 1:nrow(df)
library(tidyr)
library(dplyr)
tdf = gather(df, key = key, value = value, -id) %>%
separate(key, into = c("letter", "number"), sep = 1) %>%
mutate(number = paste0("V", number)) %>%
spread(key = number, value = value)
## now data is "tidy":
head(tdf)
# id letter V1 V2
# 1 1 a 2 7
# 2 1 b 10 4
# 3 1 c 9 10
# 4 1 d 9 4
# 5 1 e 5 8
# 6 2 a 9 8
## and the operation is simple:
tdf$V3 = tdf$V1 + tdf$V2
head(tdf)
# id letter V1 V2 V3
# 1 1 a 2 7 9
# 2 1 b 10 4 14
# 3 1 c 9 10 19
# 4 1 d 9 4 13
# 5 1 e 5 8 13
# 6 2 a 9 8 17
A possible solution using data.table:
DT <- data.table(df)[, rn := .I]
DTadd3 <- dcast(melt(DT, measure.vars = 1:10)[, `:=` (let = substr(variable,1,1), rn = 1:.N), variable
][, s3 := sum(value), .(let,rn)],
rn ~ paste0(let,3), value.var = 's3', mean)
DT[DTadd3, on = 'rn'][, rn := NULL][]
which gives:
a1 a2 b1 b2 c1 c2 d1 d2 e1 e2 a3 b3 c3 d3 e3
1: 10 5 9 5 10 4 5 3 7 10 15 14 14 8 17
2: 2 6 6 8 3 8 7 1 4 7 8 14 11 8 11
3: 6 4 7 4 4 3 4 6 3 3 10 11 7 10 6
4: 1 2 4 2 9 9 3 7 10 4 3 6 18 10 14
5: 9 10 8 1 8 7 10 5 9 1 19 9 15 15 10
6: 8 8 10 6 2 5 2 4 2 6 16 16 7 6 8
7: 7 9 1 7 5 10 9 2 1 8 16 8 15 11 9
8: 5 1 2 9 7 2 1 8 5 5 6 11 9 9 10
9: 3 7 3 3 1 6 8 10 8 9 10 6 7 18 17
10: 4 3 5 10 6 1 6 9 6 2 7 15 7 15 8
A similar solution using dplyr and tidyr:
df %>%
bind_cols(., df %>%
gather(var, val) %>%
group_by(var) %>%
mutate(let = substr(var,1,1), rn = 1:n()) %>%
group_by(let,rn) %>%
summarise(s3 = sum(val)) %>%
spread(let, s3) %>%
select(-rn)
)
However, as noted by @Gregor, it is much better to transform your data into long format. The data.table equivalent of @Gregor's answer:
DT <- data.table(df)
melt(DT[, rn := .I],
variable.name = 'let',
measure.vars = patterns('1$','2$'),
value.name = paste0('v',1:2)
)[, `:=` (let = letters[let], v3 = v1 + v2)][]
which gives (first 15 rows):
rn let v1 v2 v3
1: 1 a 10 5 15
2: 2 a 2 6 8
3: 3 a 6 4 10
4: 4 a 1 2 3
5: 5 a 9 10 19
6: 6 a 8 8 16
7: 7 a 7 9 16
8: 8 a 5 1 6
9: 9 a 3 7 10
10: 10 a 4 3 7
11: 1 b 9 5 14
12: 2 b 6 8 14
13: 3 b 7 4 11
14: 4 b 4 2 6
15: 5 b 8 1 9
My data.table solution:
sapply(c("a", "b", "c", "d", "e"), function(ll)
df[ , paste0(ll, 3) := get(paste0(ll, 1)) + get(paste0(ll, 2))])
df[]
# a1 a2 b1 b2 c1 c2 d1 d2 e1 e2 a3 b3 c3 d3 e3
# 1: 5 2 2 6 4 1 10 7 3 9 7 8 5 17 12
# 2: 4 8 7 3 3 7 9 6 9 7 12 10 10 15 16
# 3: 10 7 6 10 1 9 4 1 2 4 17 16 10 5 6
# 4: 3 4 1 7 6 4 7 4 7 5 7 8 10 11 12
# 5: 8 3 4 2 2 2 3 3 4 10 11 6 4 6 14
# 6: 6 6 5 1 8 10 1 10 5 3 12 6 18 11 8
# 7: 2 10 8 9 5 6 2 5 10 2 12 17 11 7 12
# 8: 1 1 10 8 9 5 6 9 6 8 2 18 14 15 14
# 9: 9 5 3 5 10 3 5 2 1 6 14 8 13 7 7
# 10: 7 9 9 4 7 8 8 8 8 1 16 13 15 16 9
Or, more extensibly:
sapply(c("a", "b", "c", "d", "e"), function(ll)
df[ , paste0(ll, 3) := Reduce(`+`, mget(paste0(ll, 1:2)))])
If all of the variables fit the pattern of ending with 1 or 2, you might try:
stems = unique(gsub("[0-9]", "", names(df)))
Then sapply(stems, ...)
library(tidyverse)
# Fold over the letter stems, starting from df: at each step .x is the data
# frame built so far and .y is the current stem, so a new "<stem>3" column is
# added as the sum of "<stem>1" and "<stem>2".
# `.f = ~ {...}` passes a one-sided lambda formula; the original `.f~{...}`
# parsed as a two-sided formula, which purrr cannot convert to a function.
reduce(.x = letters[1:5], .init = df, .f = ~ {
  mutate(.x, "{.y}3" := .data[[str_c(.y, 1)]] + .data[[str_c(.y, 2)]])
})

R Partial Reshape Data from Long to Wide

I like to reshape a dataset from long to wide. Specifically, the new wide dataset should consist of rows corresponding to the unique number of IDs in the long dataset, and the number of columns is a multiple of unique values of another variable.
Let's say this is the original dataset:
ID a b C d e f g
1 1 1 1 1 2 3 4
1 1 1 2 5 6 7 8
2 2 2 1 1 2 3 4
2 2 2 3 9 0 1 2
2 2 2 2 5 6 7 8
3 3 3 3 9 0 1 2
3 3 3 2 5 6 7 8
3 3 3 1 1 2 3 4
In the new dataset, the number of rows is the number of IDs, the number of columns is 3 plus the multiple of unique elements found in variable C and the values from variables d to g are populated after sorting variable C in ascending order. It should look something like this:
ID a b d1 e1 f1 g1 d2 e2 f2 g2 d3 e3 f3 g3
1 1 1 1 2 3 4 5 6 7 8 NA NA NA NA
2 2 2 1 2 3 4 5 6 7 8 9 0 1 2
3 3 3 1 2 3 4 5 6 7 8 9 0 1 2
You can use dcast from data.table:
data.table::setDT(df)
data.table::dcast(df, ID + a + b ~ C, sep = "", value.var = c("d", "e", "f", "g"), fill=NA)
ID a b d1 d2 d3 e1 e2 e3 f1 f2 f3 g1 g2 g3
1: 1 1 1 1 5 NA 2 6 NA 3 7 NA 4 8 NA
2: 2 2 2 1 5 9 2 6 0 3 7 1 4 8 2
3: 3 3 3 1 5 9 2 6 0 3 7 1 4 8 2
Base reshape version - just have to use C as your time variable and away you go.
reshape(dat, idvar=c("ID","a","b"), direction="wide", timevar="C", sep="")
# ID a b d1 e1 f1 g1 d2 e2 f2 g2 d3 e3 f3 g3
#1 1 1 1 1 2 3 4 5 6 7 8 NA NA NA NA
#3 2 2 2 1 2 3 4 5 6 7 8 9 0 1 2
#6 3 3 3 1 2 3 4 5 6 7 8 9 0 1 2

Resources