I have this data frame
structure(list(rule.id = c(1, 2), rules = structure(1:2, .Label = c("Lamp1.1,Lamp1.2",
"Lamp2.1,Lamp2.2"), class = "factor")), .Names = c("rule.id",
"rules"), row.names = c(NA, -2L), class = "data.frame")
# rule.id rules
#1 1 Lamp1.1,Lamp1.2
#2 2 Lamp2.1,Lamp2.2
which I need to split on the "rules" column by the comma separator (multiple commas can occur, not only two as in the example) and then transform into a normalized (long) format, keeping the relevant rule.id value from the original data frame.
The result should look like this:
structure(list(rule.id = c(1, 1, 2, 2), lhs = c("Lamp1.1", "Lamp1.2",
"Lamp2.1", "Lamp2.2")), .Names = c("rule.id", "lhs"), row.names = c(NA,
-4L), class = "data.frame")
# rule.id lhs
#1 1 Lamp1.1
#2 1 Lamp1.2
#3 2 Lamp2.1
#4 2 Lamp2.2
I have code that takes care of the string split and the normalized (long) format, but I am not sure how to take care of the rule.id requirement:
lhs.norm <- as.data.frame(cbind(
  rules.df$rule.id,
  unlist(strsplit(
    unlist(lapply(
      strsplit(
        unlist(lapply(as.character(rules.df$rules),
                      function(x) substr(x, 2, nchar(x)))),
        "} =>", fixed = TRUE),
      function(x) x[1])),
    ","))
))
Thanks to @akrun's solution using
cSplit(rules.df.lhs, "lhs", ",", "long")
I benchmarked 19 seconds for 1M rows (the result was around 2M rows).
We can use cSplit from splitstackshape
library(splitstackshape)
cSplit(df, "rules", ",", "long")
# rule.id rules
#1: 1 Lamp1.1
#2: 1 Lamp1.2
#3: 2 Lamp2.1
#4: 2 Lamp2.2
If this is a huge dataset, we can use stringi to split
library(stringi)
lst <- stri_split_fixed(df$rules, ",")
df2 <- data.frame(rule.id = rep(df$rule.id, lengths(lst)),
rules = unlist(lst))
df2
# rule.id rules
#1 1 Lamp1.1
#2 1 Lamp1.2
#3 2 Lamp2.1
#4 2 Lamp2.2
Another option is data.table
library(data.table)
setDT(df)[, strsplit(as.character(rules), ","), by = rule.id]
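The data.table call returns the split values in a default column named V1; a small follow-up (a sketch) restores the original column name:
out <- setDT(df)[, strsplit(as.character(rules), ","), by = rule.id]
setnames(out, "V1", "rules")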
With the new base pipes we can make @akrun's great solution using stringi::stri_split_fixed even faster. This also exploits recycling of the rule.id column.
stringi::stri_split_fixed(d$rules, ",") |>
unlist() |>
cbind(d[1])
# unlist(stringi::stri_split_fixed(d$rules, ",")) rule.id
# 1 Lamp1.1 1
# 2 Lamp1.2 2
# 3 Lamp2.1 1
# 4 Lamp2.2 2
Benchmark
sapply(c('splitstackshape', 'stringi', 'data.table', 'reshape2'),
library, character.only=TRUE)
dl <- data.frame(rule.id=1:1e6, rules=d$rules)
microbenchmark::microbenchmark(
melt=cbind(dl[1], do.call(rbind, strsplit(as.character(dl$rules), ',', fixed=T))) |>
reshape2::melt('rule.id'),
cbind=stri_split_fixed(dl$rules, ",") |>
unlist() |>
cbind(dl[1]),
dtable=as.data.table(dl)[, strsplit(as.character(rules), ","), by = rule.id],
cSplit=cSplit(dl, "rules", ",", "long"),
stringi={lst <- stri_split_fixed(dl$rules, ",")
data.frame(rule.id = rep(dl$rule.id, lengths(lst)),
rules = unlist(lst))}, times=3L)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# melt 1472.5459 1518.5649 1608.8124 1564.5838 1676.9456 1789.3075 3 b
# cbind 335.7105 365.9372 380.9120 396.1639 403.5128 410.8617 3 a
# dtable 10414.8486 10605.5725 10674.1134 10796.2965 10803.7458 10811.1951 3 d
# cSplit 3003.0660 3079.3098 3232.6108 3155.5537 3347.3832 3539.2128 3 c
# stringi 421.1481 469.1054 518.9577 517.0627 567.8626 618.6624 3 a
# Warning messages:
# 1: In type.convert.default(unlist(x, use.names = FALSE)) :
# 'as.is' should be specified by the caller; using TRUE
# 2: In type.convert.default(unlist(x, use.names = FALSE)) :
# 'as.is' should be specified by the caller; using TRUE
# 3: In type.convert.default(unlist(x, use.names = FALSE)) :
# 'as.is' should be specified by the caller; using TRUE
Note: the warnings stem from cSplit(), whose code probably hasn't been updated in a long time.
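If the warnings bother you, cSplit() also has a type.convert argument, so a sketch like the following should silence them by skipping the type conversion:
cSplit(dl, "rules", ",", "long", type.convert = FALSE)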
Data
d <- structure(list(rule.id = c(1, 2), rules = structure(1:2, .Label = c("Lamp1.1,Lamp1.2",
"Lamp2.1,Lamp2.2"), class = "factor")), .Names = c("rule.id",
"rules"), row.names = c(NA, -2L), class = "data.frame")
Related
I am looking for the most efficient way to transform
ARTNR FILGRP
1 1 9827
2 2 9348
3 3 9335, 9827, 9339
into this
ARTNR FILGRP
1 1 9827
2 2 9348
3 3 9335
4 3 9827
5 3 9339
I tried the following code and it works, but it is not elegant and has some shortcomings:
setDT(artnrs)
artnrs[, c("P1", "P2", "P3") := tstrsplit(FILGRP, ",", fixed=TRUE)] # 1)
artnrs <- melt(artnrs, c("ARTNR"), measure = patterns("^P")) # 2)
artnrs[,variable:=NULL] # 3)
artnrs <- na.omit(artnrs, cols="value") # 4)
names(artnrs)[2] <- "FILGRP" # 5)
ad 1) splits the last column into three new ones. How can I make this dynamic so it works for five or ten columns?
ad 2-5) rather clumsy operations; could I chain these better?
It is based on data.table but performance is not that critical so an easy to understand tidyverse solution would be ok. But the fewer packages, the better.
Thanks!
dput output:
structure(list(ARTNR = c(1, 2, 3), FILGRP = c("9827", "9348", "9335, 9827, 9339")),
row.names = c(NA, -3L), class = "data.frame")
df <- structure(list(ARTNR = c(1, 2, 3), FILGRP = c("9827", "9348", "9335, 9827, 9339")),
row.names = c(NA, -3L), class = "data.frame")
df2 <- strsplit(df$FILGRP, split = ", ")
df2 <- data.frame(ARTNR = rep(df$ARTNR, lengths(df2)), FILGRP = unlist(df2))
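Since the question mentions that an easy-to-understand tidyverse solution would be fine, here is a sketch using tidyr::separate_rows, which splits and unnests in one step:
library(tidyr)
separate_rows(df, FILGRP, sep = ", ")
#   ARTNR FILGRP
# 1     1   9827
# 2     2   9348
# 3     3   9335
# 4     3   9827
# 5     3   9339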
Here is a data.table approach:
library( data.table )
setDT(DT)
melt( DT[, paste0( "v", 1:length(tstrsplit( DT$FILGRP, ", ") ) ) := tstrsplit( FILGRP, ", ") ],
id.vars = "ARTNR",
measure.vars = patterns( "^v" ),
value.name = "FILGRP" )[!is.na(FILGRP), .SD, .SDcols = c(1,3) ]
# ARTNR FILGRP
# 1: 1 9827
# 2: 2 9348
# 3: 3 9335
# 4: 3 9827
# 5: 3 9339
I am not able to filter based on two conditions. as1 is a data frame:
as1
da cat
1 2016-06-04 04:05:45 A
2 2016-06-04 04:05:46 B
3 2016-06-04 04:05:45 C
4 2016-06-04 04:05:46 D
as2 <- as1 %>% filter(as.POSIXct("2016-06-04 04:05:45") && cat == "A")
I need the data frame below:
as2
da cat
1 2016-06-04 04:05:45 A
Let's make some reproducible data as your question is missing it:
as1 <- read.csv(header = T, text = "
da, cat
2016-06-04 04:05:45,A
2016-06-04 04:05:46,B
2016-06-04 04:05:45,C
2016-06-04 04:05:46,D", stringsAsFactors = FALSE)
Now, the first thing you want to check is whether the column "da" is, in fact, POSIXct.
class(as1$da)
#> [1] "character"
In my sample it is not, so I add an extra line to the dplyr pipe.
library(dplyr)
as2 <- as1 %>%
mutate(da = as.POSIXct(da)) %>% # add only if column isn't POSIXct
filter(da == as.POSIXct("2016-06-04 04:05:45") & cat == "A")
Basically, what you did wrong was leaving as.POSIXct("2016-06-04 04:05:45") as a bare expression. filter evaluates a condition, meaning it only keeps the rows where something is TRUE. Hence, to match "2016-06-04 04:05:45" you need a test: da == as.POSIXct("2016-06-04 04:05:45").
For why you need & here and not &&, see this answer.
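As a quick sketch of the difference: & is vectorised and returns one logical per row, while && only ever yields a single value (and errors on longer vectors in recent R versions), so it cannot be used inside filter.
c(TRUE, TRUE) & c(TRUE, FALSE)
#> [1]  TRUE FALSE
c(TRUE, TRUE) && c(TRUE, FALSE)
#> Error: 'length = 2' in coercion to 'logical(1)' (R >= 4.3; older versions silently used only the first elements)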
You were almost there. This is a possible solution for you: you need to parse the dates (here with lubridate) before filtering the data.
# load library
library(dplyr)
# create data
x = data.frame(da = c("2019-10-04 07:05:02","2019-10-04 07:05:03","2019-10-04 07:05:02","2019-10-04 07:05:03","2019-10-04 07:05:04"),
db = c("a","a","c","a","a"), stringsAsFactors = F)
# convert to date time format
x$da = lubridate::ymd_hms(x$da)
# see the structure of data
str(x)
# filter the data
x %>% filter(da <= lubridate::ymd_hms('2019-10-04 07:05:02') & db == 'a' )
# da db
#1 2019-10-04 07:05:02 a
Your data
# Data
x = structure(list(da = structure(c(1464993345, 1464993346, 1464993345, 1464993346), class = c("POSIXct", "POSIXt"), tzone = ""), cat = structure(1:4, .Label = c("A", "B", "C", "D"), class = "factor")), class = "data.frame", row.names = c(NA, -4L))
# convert to date time format
x$da = lubridate::ymd_hms(x$da)
# see the structure of data
str(x)
# filter the data
x %>% filter(da <= lubridate::ymd_hms('2016-06-03 15:35:45') & cat == 'A' )
# da cat
#1 2016-06-03 15:35:45 A
I have two variables a and b
a b
vessel hot
parts
nest NA
best true
neat smooth
I want to replace the blanks in b with the corresponding values of a:
la$b[i1] <- ifelse(la$b[i1] == "",la$a[i1],la$b[i1])
But it is not working
We can use data.table. Convert the 'data.frame' to 'data.table' (setDT(df1)), specify the condition in 'i' (b==''), and assign the values of 'a' that correspond to TRUE values in 'i' to 'b'. It should be fast as we are assigning in place.
library(data.table)
setDT(df1)[b=='', b:= a]
df1
# a b
#1: vessel hot
#2: parts parts
#3: nest NA
#4: best true
#5: neat smooth
Or we can just use base R:
i1 <- df1$b=='' & !is.na(df1$b)
df1$b[i1] <- df1$a[i1]
data
df1 <- structure(list(a = c("vessel", "parts", "nest", "best", "neat"
), b = c("hot", "", NA, "true", "smooth")), .Names = c("a", "b"
), class = "data.frame", row.names = c(NA, -5L))
instead of
# la$b[i1] <- ifelse(la$b[i1] == "",la$a[i1],la$b[i1])
# what is i1? it doesn't seem to have any obvious function here
... it should be:
la$b <- ifelse(la$b == "", la$a, la$b)
assuming that you want to replace blanks in b with a, and that this applies to all blanks.
it works:
df <- structure(list(a = c("vessel", "parts", "nest", "best", "neat"
), b = c("hot", "parts", NA, "true", "smooth")), .Names = c("a",
"b"), row.names = c(NA, -5L), class = "data.frame")
df$b <- ifelse(df$b=="", df$a, df$b)
# or, with `with`: df$b <- with(df, ifelse(b=="",a,b))
# > df
# a b
# 1 vessel hot
# 2 parts parts
# 3 nest <NA>
# 4 best true
# 5 neat smooth
Someone should have asked this already, but I couldn't find an answer. Say I have:
x = data.frame(q=1,w=2,e=3, ...and many many columns...)
what is the most elegant way to rename an arbitrary subset of columns, whose position I don't necessarily know, into some other arbitrary names?
e.g. Say I want to rename "q" and "e" into "A" and "B", what is the most elegant code to do this?
Obviously, I can do a loop:
oldnames = c("q","e")
newnames = c("A","B")
for(i in 1:2) names(x)[names(x) == oldnames[i]] = newnames[i]
But I wonder if there is a better way? Maybe using some of the packages? (plyr::rename etc.)
With dplyr you would do:
library(dplyr)
df = data.frame(q = 1, w = 2, e = 3)
df %>% rename(A = q, B = e)
# A w B
#1 1 2 3
Or if you want to use vectors, as suggested by @Jelena-bioinf:
library(dplyr)
df = data.frame(q = 1, w = 2, e = 3)
oldnames = c("q","e")
newnames = c("A","B")
df %>% rename_at(vars(oldnames), ~ newnames)
# A w B
#1 1 2 3
L. D. Nicolas May suggested a change, given that rename_at is being superseded by rename_with:
df %>%
rename_with(~ newnames[which(oldnames == .x)], .cols = oldnames)
# A w B
#1 1 2 3
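For completeness, the plyr::rename option mentioned in the question takes a named vector of old = new pairs; a minimal sketch:
# plyr's rename has a different interface from dplyr's rename
plyr::rename(df, c("q" = "A", "e" = "B"))
# A w B
#1 1 2 3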
setnames from the data.table package will work on data.frames or data.tables:
library(data.table)
d <- data.frame(a=1:2,b=2:3,d=4:5)
setnames(d, old = c('a','d'), new = c('anew','dnew'))
d
# anew b dnew
# 1 1 2 4
# 2 2 3 5
Note that changes are made by reference, so there is no copying (even for data.frames!).
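You can convince yourself of this with base R's tracemem(), which prints a message whenever the object is duplicated; a quick sketch on the d from above:
tracemem(d)                          # start watching d for copies
setnames(d, old = 'anew', new = 'a') # prints nothing: modified in place
names(d)[1] <- 'anew'                # base assignment duplicates, tracemem reports it
untracemem(d)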
Another solution for data frames which are not too large is (building on @thelatemail's answer):
x <- data.frame(q=1,w=2,e=3)
> x
q w e
1 1 2 3
colnames(x) <- c("A","w","B")
> x
A w B
1 1 2 3
Alternatively, you can also use:
names(x) <- c("C","w","D")
> x
C w D
1 1 2 3
Furthermore, you can also rename a subset of the column names:
names(x)[2:3] <- c("E","F")
> x
C E F
1 1 2 3
Here is the most efficient way I have found to rename multiple columns using a combination of purrr::set_names() and a few stringr operations.
library(tidyverse)
# Make a tibble with bad names
data <- tibble(
`Bad NameS 1` = letters[1:10],
`bAd NameS 2` = rnorm(10)
)
data
# A tibble: 10 x 2
`Bad NameS 1` `bAd NameS 2`
<chr> <dbl>
1 a -0.840
2 b -1.56
3 c -0.625
4 d 0.506
5 e -1.52
6 f -0.212
7 g -1.50
8 h -1.53
9 i 0.420
10 j 0.957
# Use purrr::set_names() with an anonymous function of stringr operations
data %>%
set_names(~ str_to_lower(.) %>%
str_replace_all(" ", "_") %>%
str_replace_all("bad", "good"))
# A tibble: 10 x 2
good_names_1 good_names_2
<chr> <dbl>
1 a -0.840
2 b -1.56
3 c -0.625
4 d 0.506
5 e -1.52
6 f -0.212
7 g -1.50
8 h -1.53
9 i 0.420
10 j 0.957
Update dplyr 1.0.0
The newest dplyr version became more flexible by adding rename_with(), where _with refers to a function as input. The trick is to reformulate the character vector newnames as a formula (with ~), making it equivalent to function(x) newnames.
In my subjective opinion, that is the most elegant dplyr expression.
Update: thanks to @desval, the oldnames vector must be wrapped in all_of to include all its elements:
# shortest & most elegant expression
df %>% rename_with(~ newnames, all_of(oldnames))
A w B
1 1 2 3
Side note:
If you reverse the order, you must either name the argument .fn, as .fn is expected before the .cols argument:
df %>% rename_with(oldnames, .fn = ~ newnames)
A w B
1 1 2 3
or specify the argument .cols:
df %>% rename_with(.cols = oldnames, ~ newnames)
A w B
1 1 2 3
So I recently ran into this myself. If you're not sure whether the columns exist and only want to rename those that do:
existing <- match(oldNames,names(x))
names(x)[na.omit(existing)] <- newNames[which(!is.na(existing))]
Building on @user3114046's answer:
x <- data.frame(q=1,w=2,e=3)
x
# q w e
#1 1 2 3
names(x)[match(oldnames, names(x))] <- newnames
x
# A w B
#1 1 2 3
This won't be reliant on a specific ordering of columns in the x dataset.
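To see that this doesn't depend on column order, the same lines work on a reordered copy (a quick sketch):
x2 <- data.frame(e = 3, w = 2, q = 1)
names(x2)[match(oldnames, names(x2))] <- newnames
x2
# B w A
#1 3 2 1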
You can use a named vector. Below two options (with base R and dplyr).
base R, via subsetting:
x = data.frame(q = 1, w = 2, e = 3)
rename_vec <- c(q = "A", e = "B")
## vector of same length as names(x) which returns NA if there is no match to names(x)
which_rename <- rename_vec[names(x)]
## simple ifelse where names(x) will be renamed for every non-NA
names(x) <- ifelse(is.na(which_rename), names(x), which_rename)
x
#> A w B
#> 1 1 2 3
Or a dplyr option with !!!:
library(dplyr)
rename_vec <- c(A = "q", B = "e") # the names are just the other way round than in the base R way!
x %>% rename(!!!rename_vec)
#> A w B
#> 1 1 2 3
The latter works because the 'big-bang' operator !!! forces evaluation of a list or a vector.
?`!!`
!!! forces-splice a list of objects. The elements of the list are
spliced in place, meaning that they each become one single argument.
names(x)[names(x) %in% c("q", "e")] <- c("A", "B")
This would change all the occurrences of those letters in all names:
names(x) <- gsub("q", "A", gsub("e", "B", names(x) ) )
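For example (a quick sketch), a hypothetical column called new would be caught too:
y <- data.frame(q = 1, new = 2, e = 3)
gsub("q", "A", gsub("e", "B", names(y)))
# [1] "A"   "nBw" "B"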
There are a few answers mentioning the functions dplyr::rename_with and rlang::set_names already, but they are separate. This answer illustrates the differences between the two and the use of functions and formulas to rename columns.
rename_with from the dplyr package can use either a function or a formula
to rename a selection of columns given as the .cols argument. For example passing the function name toupper:
library(dplyr)
rename_with(head(iris), toupper, starts_with("Petal"))
Is equivalent to passing the formula ~ toupper(.x):
rename_with(head(iris), ~ toupper(.x), starts_with("Petal"))
When renaming all columns, you can also use set_names from the rlang package. To make a different example, let's use paste0 as a renaming function. paste0 takes two arguments; as a result, there are different ways to pass the second argument depending on whether we use a function or a formula.
rlang::set_names(head(iris), paste0, "_hi")
rlang::set_names(head(iris), ~ paste0(.x, "_hi"))
The same can be achieved with rename_with by passing the data frame as first
argument .data, the function as second argument .fn, all columns as third
argument .cols=everything() and the function parameters as the fourth
argument .... Alternatively you can place the second, third and fourth
arguments in a formula given as the second argument.
rename_with(head(iris), paste0, everything(), "_hi")
rename_with(head(iris), ~ paste0(.x, "_hi"))
rename_with only works with data frames. set_names is more generic and can
also perform vector renaming
rlang::set_names(1:4, c("a", "b", "c", "d"))
If the table contains two columns with the same base name (for example oldname.x and oldname.y after a join), the code goes like this:
rename(df, newname1 = oldname.x, newname2 = oldname.y)
You can get the name set, save it as a list, and then do your bulk renaming on the string. A good example of this is when you are doing a long to wide transition on a dataset:
labWide
#     Lab1    Lab10    Lab11    Lab12    Lab13    Lab14    Lab15    Lab16
# 1 35.75366 22.79493 30.32075 34.25637 30.66477 32.04059 24.46663 22.53063
nameVec <- names(labWide)
nameVec <- gsub("Lab","LabLat",nameVec)
names(labWide) <- nameVec
"LabLat1" "LabLat10" "LabLat11" "LabLat12" "LabLat13" "LabLat14""LabLat15" "LabLat16" "
Side note: if you want to prepend one string to all of the column names, you can just use this simple code.
colnames(df) <- paste("renamed_",colnames(df),sep="")
Lots of sort-of answers, so I just wrote the function so you can copy/paste.
rename <- function(x, old_names, new_names) {
stopifnot(length(old_names) == length(new_names))
# pull out the names that are actually in x
old_nms <- old_names[old_names %in% names(x)]
new_nms <- new_names[old_names %in% names(x)]
# call out the column names that don't exist
not_nms <- setdiff(old_names, old_nms)
if(length(not_nms) > 0) {
msg <- paste(paste(not_nms, collapse = ", "),
"are not columns in the dataframe, so won't be renamed.")
warning(msg)
}
# rename
names(x)[names(x) %in% old_nms] <- new_nms
x
}
x = data.frame(q = 1, w = 2, e = 3)
rename(x, c("q", "e"), c("Q", "E"))
Q w E
1 1 2 3
If one row of the data contains the names you want to change all columns to, you can do
names(data) <- data[row, ]
given that data is your data frame and row is the row number containing the new values.
Then you can remove the row containing the names with
data <- data[-row,]
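A minimal worked sketch, assuming a hypothetical data frame whose first row holds the intended names:
data <- data.frame(V1 = c("id", "1", "2"), V2 = c("score", "10", "20"))
row <- 1
names(data) <- data[row, ]
data <- data[-row, ]
data
#   id score
# 2  1    10
# 3  2    20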
This is the function that you need. Just pass x to rename(x) and it will rename all the names that appear; any that aren't in the lookup won't raise an error.
rename <- function(x) {
  oldNames <- c("a", "b", "c")
  newNames <- c("d", "e", "f")
  existing <- match(oldNames, names(x))
  names(x)[na.omit(existing)] <- newNames[which(!is.na(existing))]
  return(x)
}
Many good answers above use specialized packages. This is a simple way of doing it with only base R, where each element of col2.list is a c(old, new) pair.
df.rename.cols <- function(df, col2.list) {
  old <- vapply(col2.list, `[`, character(1), 1)
  new <- vapply(col2.list, `[`, character(1), 2)
  names(df)[match(old, names(df))] <- new
  df
}
Here is an example:
df1 <- data.frame(A = c(1, 2), B = c(3, 4), C = c(5, 6), D = c(7, 8))
col.list <- list(c("A", "NewA"), c("C", "NewC"))
df.rename.cols(df1, col.list)
NewA B NewC D
1 1 3 5 7
2 2 4 6 8
I recently built off of @agile bean's answer (using rename_with, formerly rename_at) to build a function which changes column names if they exist in the data frame, so that one can make the column names of heterogeneous data frames match each other when applicable.
The looping can surely be improved, but I figured I'd share it for posterity.
create example data frame:
x= structure(list(observation_date = structure(c(18526L, 18784L,
17601L), class = c("IDate", "Date")), year = c(2020L, 2021L,
2018L)), sf_column = "geometry", agr = structure(c(id = NA_integer_,
common_name = NA_integer_, scientific_name = NA_integer_, observation_count = NA_integer_,
country = NA_integer_, country_code = NA_integer_, state = NA_integer_,
state_code = NA_integer_, county = NA_integer_, county_code = NA_integer_,
observation_date = NA_integer_, time_observations_started = NA_integer_,
observer_id = NA_integer_, sampling_event_identifier = NA_integer_,
protocol_type = NA_integer_, protocol_code = NA_integer_, duration_minutes = NA_integer_,
effort_distance_km = NA_integer_, effort_area_ha = NA_integer_,
number_observers = NA_integer_, all_species_reported = NA_integer_,
group_identifier = NA_integer_, year = NA_integer_, checklist_id = NA_integer_,
yday = NA_integer_), class = "factor", .Label = c("constant",
"aggregate", "identity")), row.names = c("3", "3.1", "3.2"), class = "data.frame")
function
match_col_names <- function(x){
col_names <- list(date = c("observation_date", "date"),
C = c("observation_count", "count","routetotal"),
yday = c("dayofyear"),
latitude = c("lat"),
longitude = c("lon","long")
)
for(i in seq_along(col_names)){
newname=names(col_names)[i]
oldnames=col_names[[i]]
toreplace = names(x)[which(names(x) %in% oldnames)]
x <- x %>%
rename_with(~newname, toreplace)
}
return(x)
}
apply function
x <- match_col_names(x)
For execution-time purposes, I would suggest using a data.table structure:
> df = data.table(x = 1:10, y = 3:12, z = 4:13)
> oldnames = c("x","y","z")
> newnames = c("X","Y","Z")
> library(microbenchmark)
> library(data.table)
> library(dplyr)
> microbenchmark(dplyr_1 = df %>% rename_at(vars(oldnames), ~ newnames) ,
+ dplyr_2 = df %>% rename(X=x,Y=y,Z=z) ,
+ data_tabl1= setnames(copy(df), old = c("x","y","z") , new = c("X","Y","Z")),
+ times = 100)
Unit: microseconds
expr min lq mean median uq max neval
dplyr_1 5760.3 6523.00 7092.538 6864.35 7210.45 17935.9 100
dplyr_2 2536.4 2788.40 3078.609 3010.65 3282.05 4689.8 100
data_tabl1 170.0 218.45 368.261 243.85 274.40 12351.7 100
Inspired by a comment from @gsk3 on a question about reshaping data, I started doing a little bit of experimentation with reshaping data where the variable names have character suffixes instead of numeric suffixes.
As an example, I'll load the dadmomw dataset from one of the UCLA ATS Stata learning webpages (see "Example 4" on the webpage).
Here's what the dataset looks like:
library(foreign)
dadmom <- read.dta("https://stats.idre.ucla.edu/stat/stata/modules/dadmomw.dat")
dadmom
# famid named incd namem incm
# 1 1 Bill 30000 Bess 15000
# 2 2 Art 22000 Amy 18000
# 3 3 Paul 25000 Pat 50000
When trying to reshape from this wide format to long, I run into a problem. Here's what I do to reshape the data.
reshape(dadmom, direction="long", idvar=1, varying=2:5,
sep="", v.names=c("name", "inc"), timevar="dadmom",
times=c("d", "m"))
# famid dadmom name inc
# 1.d 1 d 30000 Bill
# 2.d 2 d 22000 Art
# 3.d 3 d 25000 Paul
# 1.m 1 m 15000 Bess
# 2.m 2 m 18000 Amy
# 3.m 3 m 50000 Pat
Note the swapped column names for "name" and "inc"; changing v.names to c("inc", "name") doesn't solve the problem.
reshape seems very picky about wanting the columns to be named in a fairly standard way. For example, I can reshape the data correctly (and easily) if I first rename the columns:
dadmom2 <- dadmom # Just so we can continue experimenting with the original data
# Change the names of the last four variables to include a "."
names(dadmom2)[2:5] <- gsub("(d$|m$)", "\\.\\1", names(dadmom2)[2:5])
reshape(dadmom2, direction="long", idvar=1, varying=2:5,
timevar="dadmom")
# famid dadmom name inc
# 1.d 1 d Bill 30000
# 2.d 2 d Art 22000
# 3.d 3 d Paul 25000
# 1.m 1 m Bess 15000
# 2.m 2 m Amy 18000
# 3.m 3 m Pat 50000
My questions are:
Why is R swapping the columns in the example I've provided?
Can I get to this result with base R reshape without changing the variable names before reshaping?
Are there other approaches that could be considered instead of reshape?
This works (specifying via varying which columns go together):
reshape(dadmom, direction="long", varying=list(c(2, 4), c(3, 5)),
sep="", v.names=c("name", "inc"), timevar="dadmom",
times=c("d", "m"))
So you actually have nested repeated measures here; both name and inc for mom and dad. Because you have more than one series of repeated measures you have to supply a list to varying that tells reshape which group gets stacked on the other group.
So the two approaches to this problem are to provide a list as I did or to rename the columns the way the R beast likes them as you did.
See my recent blogs on base reshape for more on this (particularly the second link deals with this):
reshape (part I)
reshape (part II)
Though this question was specifically about base R, it is useful to know other approaches that help you to achieve the same type of outcome.
One alternative to reshape or merged.stack would be to use a combination of "dplyr" and "tidyr", like this:
dadmom %>%
gather(variable, value, -famid) %>% ## Make the entire dataset long
separate(variable, into = c("var", "time"), ## Split "variable" column into two...
sep = "(?<=name|inc)", perl = TRUE) %>% ## ... using regex to split the values
spread(var, value, convert = TRUE) ## Make result wide, converting type
# famid time inc name
# 1 1 d 30000 Bill
# 2 1 m 15000 Bess
# 3 2 d 22000 Art
# 4 2 m 18000 Amy
# 5 3 d 25000 Paul
# 6 3 m 50000 Pat
Another alternative would be to use melt from "data.table", like this:
library(data.table)
melt(as.data.table(dadmom), ## melt here requres a data.table
measure = patterns("name", "inc"), ## identify columns by patterns
value.name = c("name", "inc"))[ ## specify the resulting variable names
## melt creates a numeric "variable" value. Replace with factored labels
, variable := factor(variable, labels = c("d", "m"))][]
# famid variable name inc
# 1: 1 d Bill 30000
# 2: 2 d Art 22000
# 3: 3 d Paul 25000
# 4: 1 m Bess 15000
# 5: 2 m Amy 18000
# 6: 3 m Pat 50000
How do these approaches compare with merged.stack?
Both packages are much better supported. They update and test their code more extensively than I do.
melt is blazing fast.
The Hadleyverse approach is actually slower (in many of my tests, even slower than base R's reshape) probably because of having to make the data long, then wide, then performing type conversion. However, some users like its step-by-step approach.
The Hadleyverse approach might have some unintended consequences because of the requirement of making the data long before making it wide. That forces all of the measure columns to be coerced to the same type (usually "character") if they are of different types to begin with.
Neither has the same convenience as merged.stack. Just look at the code required to get the result ;-)
merged.stack, however, can probably benefit from a simplified update, something along the lines of this function:
ReshapeLong_ <- function(indt, stubs, sep = NULL) {
if (!is.data.table(indt)) indt <- as.data.table(indt)
mv <- lapply(stubs, function(y) grep(sprintf("^%s", y), names(indt)))
levs <- unique(gsub(paste(stubs, collapse="|"), "", names(indt)[unlist(mv)]))
if (!is.null(sep)) levs <- gsub(sprintf("^%s", sep), "", levs, fixed = TRUE)
melt(indt, measure = mv, value.name = stubs)[
, variable := factor(variable, labels = levs)][]
}
Which can then be used as:
ReshapeLong_(dadmom, stubs = c("name", "inc"))
How do these approaches compare with base R's reshape?
The main difference is that reshape is not able to handle unbalanced panel datasets. See, for example, "mydf2" as opposed to "mydf" in the tests below.
Test cases
Here's some sample data. "mydf" is balanced. "mydf2" is not balanced.
set.seed(1)
x <- 10000
mydf <- mydf2 <- data.frame(
id_1 = 1:x, id_2 = c("A", "B"), varAa = sample(letters, x, TRUE),
varAb = sample(letters, x, TRUE), varAc = sample(letters, x, TRUE),
varBa = sample(10, x, TRUE), varBb = sample(10, x, TRUE),
varBc = sample(10, x, TRUE), varCa = rnorm(x), varCb = rnorm(x),
varCc = rnorm(x), varDa = rnorm(x), varDb = rnorm(x), varDc = rnorm(x))
mydf2 <- mydf2[-c(9, 14)] ## Make data unbalanced
Here are some functions to test:
f1 <- function(mydf) {
mydf %>%
gather(variable, value, starts_with("var")) %>%
separate(variable, into = c("var", "time"),
sep = "(?<=varA|varB|varC|varD)", perl = TRUE) %>%
spread(var, value, convert = TRUE)
}
f2 <- function(mydf) {
melt(as.data.table(mydf),
measure = patterns(paste0("var", c("A", "B", "C", "D"))),
value.name = paste0("var", c("A", "B", "C", "D")))[
, variable := factor(variable, labels = c("a", "b", "c"))][]
}
f3 <- function(mydf) {
merged.stack(mydf, var.stubs = paste0("var", c("A", "B", "C", "D")), sep = "var.stubs")
}
## Won't run with "mydf2". Should run with "mydf"
f4 <- function(mydf) {
reshape(mydf, direction = "long",
varying = lapply(c("varA", "varB", "varC", "varD"),
function(x) grep(x, names(mydf))),
sep = "", v.names = paste0("var", c("A", "B", "C", "D")),
timevar="time", times = c("a", "b", "c"))
}
Test performance:
library(microbenchmark)
microbenchmark(f1(mydf), f2(mydf), f3(mydf), f4(mydf))
# Unit: milliseconds
# expr min lq mean median uq max neval
# f1(mydf) 463.006547 492.073086 528.533319 514.189548 538.910756 867.93356 100
# f2(mydf) 3.737321 4.108376 6.674066 4.332391 4.761681 47.71142 100
# f3(mydf) 60.211254 64.766770 86.812077 87.040087 92.841747 262.89409 100
# f4(mydf) 40.596455 43.753431 61.006337 48.963145 69.983623 230.48449 100
Observations:
Base R's reshape would not be able to handle reshaping "mydf2".
The "dplyr" + "tidyr" approach would mangle the results in the resulting "varB", "varC", and "varD" because values would be coerced to character.
As the benchmarks show, reshape gives reasonable performance.
Note: Because of the difference in time between posting my last answer and the differences in approach, I thought I would share this as a new answer.
merged.stack from my "splitstackshape" package handles this by utilizing the sep = "var.stubs" construct:
library(splitstackshape)
merged.stack(dadmom, var.stubs = c("inc", "name"), sep = "var.stubs")
# famid .time_1 inc name
# 1: 1 d 30000 Bill
# 2: 1 m 15000 Bess
# 3: 2 d 22000 Art
# 4: 2 m 18000 Amy
# 5: 3 d 25000 Paul
# 6: 3 m 50000 Pat
Notice that since there is no real separator in the variables that are being stacked, we can just strip out the var.stubs from the names to create the "time" variables. Using sep = "var.stubs" is equivalent to doing sep = "inc|name".
This works because ".time_1" is created by stripping out what is left after removing the "var.stubs" from the column names.
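To make that equivalence concrete, this sketch spells the separator out by hand and should return the same result as the call above:
merged.stack(dadmom, var.stubs = c("inc", "name"), sep = "inc|name")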