I'm looking to transform a data frame of 660 columns into 3 columns just by stacking them on each other per 3 columns without manually re-arranging (since I have 660 columns).
In a small scale example per 2 columns with just 4 columns, I want to go from
A B C D
1 4 7 10
2 5 8 11
3 6 9 12
to
A B
1 4
2 5
3 6
7 10
8 11
9 12
Thanks
reshape to the rescue:
reshape(df, direction="long", varying=split(names(df), rep(seq_len(ncol(df)/2), 2)))
# time A B id
#1.1 1 1 4 1
#2.1 1 2 5 2
#3.1 1 3 6 3
#1.2 2 7 10 1
#2.2 2 8 11 2
#3.2 2 9 12 3
rbind.data.frame requires that all columns match up. So use setNames to replace the names of the C:D columns:
rbind( dat[1:2], setNames(dat[3:4], names(dat[1:2])) )
A B
1 1 4
2 2 5
3 3 6
4 7 10
5 8 11
6 9 12
To generalize that to multiple columns use do.call and lapply:
dat <- setNames( as.data.frame( matrix(1:36, ncol=12) ), LETTERS[1:12])
dat
#----
A B C D E F G H I J K L
1 1 4 7 10 13 16 19 22 25 28 31 34
2 2 5 8 11 14 17 20 23 26 29 32 35
3 3 6 9 12 15 18 21 24 27 30 33 36
do.call( rbind, lapply( seq(1,12, by=3), function(x) setNames(dat[x:(x+2)], LETTERS[1:3]) ))
A B C
1 1 4 7
2 2 5 8
3 3 6 9
4 10 13 16
5 11 14 17
6 12 15 18
7 19 22 25
8 20 23 26
9 21 24 27
10 28 31 34
11 29 32 35
12 30 33 36
The 12 would be replaced by 660 and everything else should work.
A classical split-apply-combine approach will scale flexibly:
as.data.frame(lapply(split(unclass(df),
names(df)[seq(ncol(df) / 2)]),
unlist, use.names = FALSE))
## A B
## 1 1 4
## 2 2 5
## 3 3 6
## 4 7 10
## 5 8 11
## 6 9 12
or with a hint of purrr,
library(purrr)
df %>% unclass() %>% # convert to non-data.frame list
split(names(.)[seq(length(.) / 2)]) %>% # split columns by indexed names
map_df(simplify) # simplify each split to vector, coerce back to data.frame
## # A tibble: 6 × 2
## A B
## <int> <int>
## 1 1 4
## 2 2 5
## 3 3 6
## 4 7 10
## 5 8 11
## 6 9 12
Here is another base R option
i1 <- c(TRUE, FALSE)
`row.names<-`(data.frame(A= unlist(df1[i1]), B = unlist(df1[!i1])), NULL)
# A B
#1 1 4
#2 2 5
#3 3 6
#4 7 10
#5 8 11
#6 9 12
Or another option is melt from data.table
library(data.table)
i1 <- seq(1, ncol(df1), by = 2)
i2 <- seq(2, ncol(df1), by = 2)
melt(setDT(df1), measure = list(i1, i2), value.name = c("A", "B"))
rbindlist from data.table package can also be used for the task and seems to be much more efficient.
# EXAMPLE DATA
df1 <- read.table(text = '
Col1 Col2 Col3 Col4
1 2 3 4
5 6 7 8
1 2 3 4
5 6 7 8', header = TRUE)
library(data.table)
library(microbenchmark)
library(purrr)
microbenchmark(
Map = as.data.frame(Map(c, df1[,1:2], df1[, 3:4])),
Reshape = reshape(df1, direction="long", varying=split(names(df1), rep(seq_len(ncol(df1)/2), 2))),
Purr = df1 %>% unclass() %>% # convert to non-data.frame list
split(names(.)[seq(length(.) / 2)]) %>% # split columns by indexed names
map_df(simplify),
DataTable = rbindlist(list(df1[,1:2], df1[, 3:4])),
Mapply = data.frame(mapply(c, df1[,1:2], df1[, 3:4], SIMPLIFY=FALSE)),
Rbind = rbind(df1[, 1:2],setnames(df1[, 3:4],names(df1[,1:2])))
)
The results are:
Unit: microseconds
expr min lq mean median uq max neval cld
Map 214.724 232.9380 246.2936 244.1240 255.9240 343.611 100 bc
Reshape 716.962 739.8940 778.7912 749.7550 767.6725 2834.192 100 e
Purr 309.559 324.6545 339.2973 334.0440 343.4290 551.746 100 d
DataTable 98.228 111.6080 122.7753 119.2320 129.2640 189.614 100 a
Mapply 233.577 258.2605 271.1881 270.7895 281.6305 339.291 100 c
Rbind 206.001 221.1515 228.5956 226.6850 235.2670 283.957 100 b
Related
I'm trying to better understant the data.table package in r. I want to do different types of calculation with some columns and assign the result to new columns with specific names. Here is an example:
set.seed(122)
df <- data.frame(rain = rep(5,10),temp=1:10, skip = sample(0:2,10,T),
windw_sz = sample(1:2,10,T),city =c(rep("a",5),rep("b",5)),ord=rep(sample(1:5,5),2))
df <- as.data.table(df)
vars <- c("rain","temp")
df[, paste0("mean.",vars) := lapply(mget(vars),mean), by="city" ]
This works just fine. But now I also want to calculate the sum of these variables, so I try:
df[, c(paste0("mean.",vars), paste("sum.",vars)) := list( lapply(mget(vars),mean),
lapply(mget(vars),sum)), by="city" ]
and I get an error.
How could I implement this last part?
Thanks a lot!
Instead of list wrap, we can do a c as the lapply output is a list, and when do list as wrapper, it returns a list of list. However, with c, it concats two list end to end (i.e. c(as.list(1:5), as.list(6:10)) as opposed to list(as.list(1:5), as.list(6:10))) and instead of mget, make use of .SDcols
library(data.table)
df[, paste0(rep(c("mean.", "sum."), each = 2), vars) :=
c(lapply(.SD, mean), lapply(.SD, sum)), by = .(city), .SDcols = vars]
df
# rain temp skip windw_sz city ord mean.rain mean.temp sum.rain sum.temp
# 1: 5 1 0 2 a 2 5 3 25 15
# 2: 5 2 1 1 a 5 5 3 25 15
# 3: 5 3 2 2 a 3 5 3 25 15
# 4: 5 4 2 1 a 4 5 3 25 15
# 5: 5 5 2 2 a 1 5 3 25 15
# 6: 5 6 0 1 b 2 5 8 25 40
# 7: 5 7 2 2 b 5 5 8 25 40
# 8: 5 8 1 2 b 3 5 8 25 40
# 9: 5 9 2 1 b 4 5 8 25 40
#10: 5 10 2 2 b 1 5 8 25 40
This code removes all columns which contain at least one NA.
library(dplyr)
df %>%
select_if(~ !any(is.na(.)))
What do I need to modify if I want only remove the columns that have NA for the eighth row (for my generated data below)?
set.seed(1234)
df <- data.frame(A = 1:10, B = 11:20, C = 21:30)
df <- as.data.frame(lapply(df, function(cc) cc[ sample(c(TRUE, NA), prob = c(0.85, 0.15), size = length(cc), replace = TRUE) ]))
In base-R one can simply try as:
df[,which(!is.na(df[8,]))]
Or as suggested by #RichScriven:
df[, !is.na(df[8,])]
# A B
# 1 1 11
# 2 2 12
# 3 3 13
# 4 4 NA
# 5 NA 15
# 6 6 16
# 7 7 17
# 8 8 18
# 9 9 19
# 10 10 20
You could do this:
df %>%
select_if(!is.na(.[8,]))
A B
1 1 11
2 2 12
3 3 13
4 4 NA
5 NA 15
6 6 16
7 7 17
8 8 18
9 9 19
10 10 20
Another option is keep
library(purrr)
keep(df, ~ !(is.na(.x[8])))
# A B
#1 1 11
#2 2 12
#3 3 13
#4 4 NA
#5 NA 15
#6 6 16
#7 7 17
#8 8 18
#9 9 19
#10 10 20
Or with Filter from base R
Filter(function(x) !(is.na(x[8])), df)
I'm trying to create new variables from existing variables like below:
a1+a2=a3, b1+b2=b3, ..., z1+z2=z3
Here is an example data frame
df <- data.frame(replicate(10,sample(1:10)))
colnames(df) <- c("a1","a2","b1","b2","c1","c2","d1","d2","e1","e2")
Here's my solution with repeating codes
# a solution by base R
df$a3 <- df$a1 + df$a2
df$b3 <- df$b1 + df$b2
df$c3 <- df$c1 + df$c2
df$d3 <- df$d1 + df$d2
df$e3 <- df$e1 + df$e2
Or
# a solution by dplyr
library(dplyr)
df <- df %>%
mutate(a3 = a1+a2,
b3 = b1+b2,
c3 = c1+c2,
d3 = d1+d2,
e3 = e1+d2)
Or
# a solution by data.table
library(data.table)
DT <- data.table(df)
DT[,a3:=a1+a2][,b3:=b1+b2][,c3:=c1+c2][,d3:=d1+d2][,e3:=e1+e2]
Actually I have more than 100 variables, so I want to find a way to do so without repeating code... Although I tried to use mutate_ with standard evaluation and regular expression, I lost my way because I'm a newbie in R. Can you mutate multiple variables without repeating code?
Your data format is making this hard - I would reshape the data like this. In general, you shouldn't encode actual data information in column names, if the difference between a1 and a2 is meaningful, it is better to have a column with letter, a, b, c and a column with number, 1, 2.
df$id = 1:nrow(df)
library(tidyr)
library(dplyr)
tdf = gather(df, key = key, value = value, -id) %>%
separate(key, into = c("letter", "number"), sep = 1) %>%
mutate(number = paste0("V", number)) %>%
spread(key = number, value = value)
## now data is "tidy":
head(tdf)
# id letter V1 V2
# 1 1 a 2 7
# 2 1 b 10 4
# 3 1 c 9 10
# 4 1 d 9 4
# 5 1 e 5 8
# 6 2 a 9 8
## and the operation is simple:
tdf$V3 = tdf$V1 + tdf$V2
head(tdf)
# id letter V1 V2 V3
# 1 1 a 2 7 9
# 2 1 b 10 4 14
# 3 1 c 9 10 19
# 4 1 d 9 4 13
# 5 1 e 5 8 13
# 6 2 a 9 8 17
A possible solution using data.table:
DT <- data.table(df)[, rn := .I]
DTadd3 <- dcast(melt(DT, measure.vars = 1:10)[, `:=` (let = substr(variable,1,1), rn = 1:.N), variable
][, s3 := sum(value), .(let,rn)],
rn ~ paste0(let,3), value.var = 's3', mean)
DT[DTadd3, on = 'rn'][, rn := NULL][]
which gives:
a1 a2 b1 b2 c1 c2 d1 d2 e1 e2 a3 b3 c3 d3 e3
1: 10 5 9 5 10 4 5 3 7 10 15 14 14 8 17
2: 2 6 6 8 3 8 7 1 4 7 8 14 11 8 11
3: 6 4 7 4 4 3 4 6 3 3 10 11 7 10 6
4: 1 2 4 2 9 9 3 7 10 4 3 6 18 10 14
5: 9 10 8 1 8 7 10 5 9 1 19 9 15 15 10
6: 8 8 10 6 2 5 2 4 2 6 16 16 7 6 8
7: 7 9 1 7 5 10 9 2 1 8 16 8 15 11 9
8: 5 1 2 9 7 2 1 8 5 5 6 11 9 9 10
9: 3 7 3 3 1 6 8 10 8 9 10 6 7 18 17
10: 4 3 5 10 6 1 6 9 6 2 7 15 7 15 8
A similar solution using dplyr and tidyr:
df %>%
bind_cols(., df %>%
gather(var, val) %>%
group_by(var) %>%
mutate(let = substr(var,1,1), rn = 1:n()) %>%
group_by(let,rn) %>%
summarise(s3 = sum(val)) %>%
spread(let, s3) %>%
select(-rn)
)
However, as noted by #Gregor, it is much better to transform your data into long format. The data.table equivalent of #Gregor's answer:
DT <- data.table(df)
melt(DT[, rn := .I],
variable.name = 'let',
measure.vars = patterns('1$','2$'),
value.name = paste0('v',1:2)
)[, `:=` (let = letters[let], v3 = v1 + v2)][]
which gives (first 15 rows):
rn let v1 v2 v3
1: 1 a 10 5 15
2: 2 a 2 6 8
3: 3 a 6 4 10
4: 4 a 1 2 3
5: 5 a 9 10 19
6: 6 a 8 8 16
7: 7 a 7 9 16
8: 8 a 5 1 6
9: 9 a 3 7 10
10: 10 a 4 3 7
11: 1 b 9 5 14
12: 2 b 6 8 14
13: 3 b 7 4 11
14: 4 b 4 2 6
15: 5 b 8 1 9
My data.table solution:
sapply(c("a", "b", "c", "d", "e"), function(ll)
df[ , paste0(ll, 3) := get(paste0(ll, 1)) + get(paste0(ll, 2))])
df[]
# a1 a2 b1 b2 c1 c2 d1 d2 e1 e2 a3 b3 c3 d3 e3
# 1: 5 2 2 6 4 1 10 7 3 9 7 8 5 17 12
# 2: 4 8 7 3 3 7 9 6 9 7 12 10 10 15 16
# 3: 10 7 6 10 1 9 4 1 2 4 17 16 10 5 6
# 4: 3 4 1 7 6 4 7 4 7 5 7 8 10 11 12
# 5: 8 3 4 2 2 2 3 3 4 10 11 6 4 6 14
# 6: 6 6 5 1 8 10 1 10 5 3 12 6 18 11 8
# 7: 2 10 8 9 5 6 2 5 10 2 12 17 11 7 12
# 8: 1 1 10 8 9 5 6 9 6 8 2 18 14 15 14
# 9: 9 5 3 5 10 3 5 2 1 6 14 8 13 7 7
# 10: 7 9 9 4 7 8 8 8 8 1 16 13 15 16 9
Or, more extensibly:
sapply(c("a", "b", "c", "d", "e"), function(ll)
df[ , paste0(ll, 3) := Reduce(`+`, mget(paste0(ll, 1:2)))])
If all of the variables fit the pattern of ending with 1 or 2, you might try:
stems = unique(gsub("[0-9]", "", names(df)))
Then sapply(stems, ...)
library(tidyverse)
reduce(.init=df, .x=letters[1:5], .f~{
mutate(.x, '{.y}3' := get(str_c(.y, 1)) + get(str_c(.y, 2)))
})
I have a problem to combine two different dimension dataframes which each dataframe has huge rows. Let's say, the sample of my dataframes are d and e, and new expected dataframe is de. I would like to make pair between all value in same row both in d and e, and construct those pairs in a new dataframe (de). Any idea/help for solving my problem is really appreciated. Thanks
> d <- data.frame(v1 = c(1,3,5), v2 = c(2,4,6))
> d
v1 v2
1 1 2
2 3 4
3 5 6
> e <- data.frame(v1 = c(11, 14), v2 = c(12,15), v3=c(13,16))
> e
v1 v2 v3
1 11 12 13
2 14 15 16
> de <- data.frame(x = c(1,1,1,2,2,2,3,3,3,4,4,4), y = c(11,12,13,11,12,13,14,15,16,14,15,16))
> de
x y
1 1 11
2 1 12
3 1 13
4 2 11
5 2 12
6 2 13
7 3 14
8 3 15
9 3 16
10 4 14
11 4 15
12 4 16
One solution is to "melt" d and e into long format, then merge, then get rid of the extra columns. If you have very large datasets, data tables are much faster (no difference for this tiny dataset).
library(reshape2) # for melt(...)
library(data.table)
# add id column
d <- cbind(id=1:nrow(d),d)
e <- cbind(id=1:nrow(e),e)
# melt to long format
d.melt <- data.table(melt(d,id.vars="id"), key="id")
e.melt <- data.table(melt(e,id.vars="id"), key="id")
# data table join, remove extra columns
result <- d.melt[e.melt, allow.cartesian=T]
result[,":="(id=NULL,variable=NULL,variable.1=NULL)]
setnames(result,c("x","y"))
setkey(result,x,y)
result
x y
1: 1 12
2: 1 13
3: 1 14
4: 2 12
5: 2 13
6: 2 14
7: 3 15
8: 3 16
9: 3 17
10: 4 15
11: 4 16
12: 4 17
If your data are numeric, like they are in this example, this is pretty straightforward in base R too. Conceptually this is the same as #jlhoward's answer: get your data into a long format, and merge:
merge(cbind(id = rownames(d), stack(d)),
cbind(id = rownames(e), stack(e)),
by = "id")[c("values.x", "values.y")]
# values.x values.y
# 1 1 11
# 2 1 12
# 3 1 13
# 4 2 11
# 5 2 12
# 6 2 13
# 7 3 14
# 8 3 15
# 9 3 16
# 10 4 14
# 11 4 15
# 12 4 16
Or, with the "reshape2" package:
merge(melt(as.matrix(d)),
melt(as.matrix(e)),
by = "Var1")[c("value.x", "value.y")]
With data frame:
df <- data.frame(id = rep(1:3, each = 5)
, hour = rep(1:5, 3)
, value = sample(1:15))
I want to add a cumulative sum column that matches the id:
df
id hour value csum
1 1 1 7 7
2 1 2 9 16
3 1 3 15 31
4 1 4 11 42
5 1 5 14 56
6 2 1 10 10
7 2 2 2 12
8 2 3 5 17
9 2 4 6 23
10 2 5 4 27
11 3 1 1 1
12 3 2 13 14
13 3 3 8 22
14 3 4 3 25
15 3 5 12 37
How can I do this efficiently? Thanks!
df$csum <- ave(df$value, df$id, FUN=cumsum)
ave is the "go-to" function if you want a by-group vector of equal length to an existing vector and it can be computed from those sub vectors alone. If you need by-group processing based on multiple "parallel" values, the base strategy is do.call(rbind, by(dfrm, grp, FUN)).
To add to the alternatives, data.table's syntax is nice:
library(data.table)
DT <- data.table(df, key = "id")
DT[, csum := cumsum(value), by = key(DT)]
Or, more compactly:
library(data.table)
setDT(df)[, csum := cumsum(value), id][]
The above will:
Convert the data.frame to a data.table by reference
Calculate the cumulative sum of value grouped by id and assign it by reference
Print (the last [] there) the result of the entire operation
"df" will now be a data.table with a "csum" column.
Using dplyr::
require(dplyr)
df %>% group_by(id) %>% mutate(csum = cumsum(value))
Using library plyr.
library(plyr)
ddply(df,.(id),transform,csum=cumsum(value))
Using base R
df <- data.frame(id = rep(1:3, each = 5)
, hour = rep(1:5, 3)
, value = sample(1:15))
transform(df , csum = ave(value , id , FUN = cumsum))
#> id hour value csum
#> 1 1 1 4 4
#> 2 1 2 12 16
#> 3 1 3 13 29
#> 4 1 4 6 35
#> 5 1 5 5 40
#> 6 2 1 15 15
#> 7 2 2 1 16
#> 8 2 3 2 18
#> 9 2 4 8 26
#> 10 2 5 9 35
#> 11 3 1 11 11
#> 12 3 2 7 18
#> 13 3 3 10 28
#> 14 3 4 3 31
#> 15 3 5 14 45
Created on 2022-06-05 by the reprex package (v2.0.1)