It is a truth universally acknowledged that R's base reshape command is speedy and powerful but has miserable syntax. I have therefore written a quick wrapper around it which I will throw into the next release of the taRifx package. Before I do that, however, I want to solicit improvements.
Here's my version, with updates from #RichieCotton:
# reshapeasy: Version of reshape with way, way better syntax
# Written with the help of the StackOverflow R community
#
# Thin wrapper around stats::reshape() that takes the same id/vary arguments in
# both directions, so only `direction` has to change to go back and forth.
#
# data is a data.frame to be reshaped
# direction is "wide" or "long" (the format to convert `data` to)
# id are the variables that identify unique observations; column names, or
#   column positions / a logical mask over the columns (translated to names)
# vary is the variable that varies. Going to wide this variable will cease to
#   exist. Going to long it will be created. Positions / logical masks are
#   translated to column names of `data` in the same way as for id.
# omit is a vector of characters; a single trailing character from this set is
#   stripped from the end of the resulting column names. NULL or "" disables
#   the stripping.
# vars are the names of the (stubs of) the variables to be reshaped.
#   NOTE: currently accepted but not used; every column not in id is reshaped
#   when going to long.
# ... are options to be passed to stats::reshape
#
# Returns the reshaped data.frame. Stops with an error if direction is neither
# "wide" nor "long".
reshapeasy <- function(data, direction,
                       id = (sapply(data, is.factor) | sapply(data, is.character)),
                       vary = sapply(data, is.numeric),
                       omit = c("_", "."), vars = NULL, ...) {
  # Fail early on an unknown direction instead of silently skipping the reshape.
  direction <- match.arg(direction, c("wide", "long"))

  # stats::reshape and the %in% test below need column *names*; the defaults
  # (and lazy callers) supply logical masks or column positions instead.
  if (is.numeric(id) || is.logical(id)) {
    id <- colnames(data)[id]
  }
  if (is.numeric(vary) || is.logical(vary)) {
    vary <- colnames(data)[vary]
  }

  if (direction == "wide") {
    data <- stats::reshape(
      data = data, direction = direction,
      idvar = id, timevar = vary, ...
    )
  } else {
    # Everything that is not an id column is treated as time-varying.
    varying <- which(!(colnames(data) %in% id))
    data <- stats::reshape(
      data = data, direction = direction,
      idvar = id, varying = varying, timevar = vary, ...
    )
  }

  # An empty character class ("[]$") is an invalid regex, so only strip
  # trailing characters when there is something to strip.
  omit_chars <- paste(omit, collapse = "")
  if (nzchar(omit_chars)) {
    colnames(data) <- gsub(paste0("[", omit_chars, "]$"), "", colnames(data))
  }
  data
}
Note that you can move from wide to long without changing the options other than the direction. To me, this is the key to usability.
I'm happy to give acknowledgement in the function help files for any substantial improvements if you chat or e-mail me your info.
Improvements might fall in the following areas:
Naming the function and its arguments
Making it more general (currently it handles a fairly specific case, which I believe to be by far the most common, but it has not yet exhausted the capabilities of stats::reshape)
Code improvements
Examples
Sample data
x.wide <- structure(list(surveyNum = 1:6, pio_1 = structure(c(2L, 2L, 1L,
2L, 1L, 1L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), pio_2 = structure(c(2L, 1L, 2L, 1L,
2L, 2L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), pio_3 = structure(c(2L, 2L, 1L, 1L,
2L, 1L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), caremgmt_1 = structure(c(2L, 1L, 1L,
2L, 1L, 2L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), caremgmt_2 = structure(c(1L, 2L, 2L,
2L, 2L, 1L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), caremgmt_3 = structure(c(1L, 2L, 1L,
2L, 1L, 1L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), prev_1 = structure(c(1L, 2L, 2L, 1L,
1L, 2L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), prev_2 = structure(c(2L, 2L, 1L, 2L,
1L, 1L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), prev_3 = structure(c(2L, 1L, 2L, 2L,
1L, 1L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2"), class = "factor"), price_1 = structure(c(2L, 1L, 2L, 5L,
3L, 4L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2", "3", "4", "5", "6"), class = "factor"), price_2 = structure(c(6L,
5L, 5L, 4L, 4L, 2L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2", "3", "4", "5", "6"), class = "factor"), price_3 = structure(c(3L,
5L, 2L, 5L, 4L, 5L), .Names = c("1", "2", "3", "4", "5", "6"), .Label = c("1",
"2", "3", "4", "5", "6"), class = "factor")), .Names = c("surveyNum",
"pio_1", "pio_2", "pio_3", "caremgmt_1", "caremgmt_2", "caremgmt_3",
"prev_1", "prev_2", "prev_3", "price_1", "price_2", "price_3"
), idvars = "surveyNum", rdimnames = list(structure(list(surveyNum = 1:24), .Names = "surveyNum", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24"
), class = "data.frame"), structure(list(variable = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L), .Label = c("pio",
"caremgmt", "prev", "price"), class = "factor"), .id = c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L)), .Names = c("variable",
".id"), row.names = c("pio_1", "pio_2", "pio_3", "caremgmt_1",
"caremgmt_2", "caremgmt_3", "prev_1", "prev_2", "prev_3", "price_1",
"price_2", "price_3"), class = "data.frame")), row.names = c(NA,
6L), class = c("cast_df", "data.frame"))
x.long <- structure(list(.id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), pio = structure(c(2L,
2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L,
1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 2L, 1L, 1L), .Label = c("1", "2"), class = "factor"),
caremgmt = structure(c(2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L,
1L, 2L, 2L), .Label = c("1", "2"), class = "factor"), prev = structure(c(1L,
2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L,
2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L), .Label = c("1",
"2"), class = "factor"), price = structure(c(2L, 1L, 2L,
5L, 3L, 4L, 1L, 5L, 4L, 3L, 1L, 2L, 6L, 6L, 5L, 4L, 6L, 3L,
5L, 6L, 3L, 1L, 2L, 4L, 3L, 5L, 2L, 5L, 4L, 5L, 6L, 6L, 4L,
6L, 4L, 1L, 2L, 3L, 1L, 2L, 2L, 5L, 1L, 6L, 1L, 3L, 4L, 3L,
6L, 5L, 5L, 4L, 4L, 2L, 2L, 2L, 6L, 3L, 1L, 4L, 4L, 5L, 1L,
3L, 6L, 1L, 3L, 5L, 1L, 3L, 6L, 2L), .Label = c("1", "2",
"3", "4", "5", "6"), class = "factor"), surveyNum = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L)), .Names = c(".id",
"pio", "caremgmt", "prev", "price", "surveyNum"), row.names = c(NA,
-72L), class = "data.frame")
Examples
> x.wide
surveyNum pio_1 pio_2 pio_3 caremgmt_1 caremgmt_2 caremgmt_3 prev_1 prev_2 prev_3 price_1 price_2 price_3
1 1 2 2 2 2 1 1 1 2 2 2 6 3
2 2 2 1 2 1 2 2 2 2 1 1 5 5
3 3 1 2 1 1 2 1 2 1 2 2 5 2
4 4 2 1 1 2 2 2 1 2 2 5 4 5
5 5 1 2 2 1 2 1 1 1 1 3 4 4
6 6 1 2 1 2 1 1 2 1 1 4 2 5
> reshapeasy( x.wide, "long", NULL, id="surveyNum", vary="id", sep="_" )
surveyNum id pio caremgmt prev price
1.1 1 1 2 2 1 2
2.1 2 1 2 1 2 1
3.1 3 1 1 1 2 2
4.1 4 1 2 2 1 5
5.1 5 1 1 1 1 3
6.1 6 1 1 2 2 4
1.2 1 2 2 1 2 6
2.2 2 2 1 2 2 5
3.2 3 2 2 2 1 5
4.2 4 2 1 2 2 4
5.2 5 2 2 2 1 4
6.2 6 2 2 1 1 2
1.3 1 3 2 1 2 3
2.3 2 3 2 2 1 5
3.3 3 3 1 1 2 2
4.3 4 3 1 2 2 5
5.3 5 3 2 1 1 4
6.3 6 3 1 1 1 5
> head(x.long)
.id pio caremgmt prev price surveyNum
1 1 2 2 1 2 1
2 1 2 1 2 1 2
3 1 1 1 2 2 3
4 1 2 2 1 5 4
5 1 1 1 1 3 5
6 1 1 2 2 4 6
> head(reshapeasy( x.long, direction="wide", id="surveyNum", vary=".id" ))
surveyNum pio.1 caremgmt.1 prev.1 price.1 pio.3 caremgmt.3 prev.3 price.3 pio.2 caremgmt.2 prev.2 price.2
1 1 2 2 1 2 2 1 2 3 2 1 2 6
2 2 2 1 2 1 2 2 1 5 1 2 2 5
3 3 1 1 2 2 1 1 2 2 2 2 1 5
4 4 2 2 1 5 1 2 2 5 1 2 2 4
5 5 1 1 1 3 2 1 1 4 2 2 1 4
6 6 1 2 2 4 1 1 1 5 2 1 1 2
I would also like to see an option to order the output, since that's one of the things I don't like about reshape in base R. As an example, let's use the Stata Learning Module: Reshaping data wide to long, which you are already familiar with. The example I'm looking at is the "kids height and weight at age 1 and age 2" example.
Here's what I normally do with reshape():
# library(foreign)
kidshtwt <- read.dta("http://www.ats.ucla.edu/stat/stata/modules/kidshtwt.dta")
# Columns 1:2 identify a row; columns 3:6 are the time-varying measurements
# whose names are split (sep = "") into a stub and an "age" value.
kidshtwt.l <- reshape(
  kidshtwt,
  direction = "long",
  idvar = 1:2,
  varying = 3:6,
  sep = "",
  timevar = "age"
)
# reshape() returns the rows in a correct but inconvenient order,
# so re-sort them by family and then by birth order.
kidshtwt.l <- kidshtwt.l[order(kidshtwt.l$famid, kidshtwt.l$birth), ]
Since this is an annoying step that I always have to go through when reshaping the data, I think it would be useful to add that into your function.
I also suggest at least having an option for doing the same thing with the final column order for reshaping from long to wide.
Example function for column ordering
I'm not sure of the best way to integrate this into your function, but I put this together to sort a data frame based on basic patterns for the variable names.
# Reorder the columns of a data frame according to a sequence of name patterns.
#
# data is a data.frame
# patterns is a character vector of regular expressions, in the desired column
#   order. Columns matching the first pattern come first (sorted by name),
#   then columns matching the second pattern, and so on.
#
# Returns a data.frame containing only the matched columns. A column that
# matches several patterns appears once, at the position of its first match.
# Columns matching no pattern are dropped; an empty `patterns` yields a
# zero-column data.frame.
col.name.sort <- function(data, patterns) {
  col_names <- names(data)
  # lapply copes with zero patterns, unlike a `1:length(patterns)` loop.
  matched <- lapply(patterns, function(pattern) {
    sort(grep(pattern, col_names, value = TRUE))
  })
  ordered <- unique(as.character(unlist(matched)))
  # drop = FALSE keeps a data.frame even when a single column is selected.
  data[, ordered, drop = FALSE]
}
It can be used in the following manner. Imagine we had saved the output of your reshapeasy long to wide example as a data frame named a, and we wanted it ordered by "surveyNum", "caremgmt" (1-3), "prev" (1-3), "pio" (1-3), and "price" (1-3), we could use:
col.name.sort(a, c("sur", "car", "pre", "pio", "pri"))
Some initial thoughts:
I've always thought that the direction commands "wide" and "long" were a little fuzzy. Do they mean you want to convert the data to that format, or that the data is already in that format? It is something that you need to learn or look up. You can avoid that problem by having two separate functions, reshapeToWide and reshapeToLong. As a bonus, the signature of each function has one fewer argument.
I don't think you meant to include the line
varying <- which(!(colnames(x.wide) %in% "surveyNum"))
since it refers to a specific dataset.
I prefer data to x for the first argument since it makes it clear that the input should be a data frame.
It is generally better form to have arguments without defaults first. So vars should come after id and vary.
Can you pick defaults for id and vary? reshape::melt defaults to factor and character columns for id and numeric columns for vary.
I think there might be a mistake in your example. For going from wide to long, I get the following error:
> reshapeasy( x.wide, "long", NULL, id="surveyNum", vary="id", sep="_" )
Error in gsub(paste("[", paste(omit, collapse = "", sep = ""), "]$", sep = ""), :
invalid regular expression '[]$', reason 'Missing ']''
Removing the NULL corrects the problem. Which leads me to ask, what is the intended purpose of that NULL?
I also think that the function would be improved if it generated a time variable by default, if not explicitly specified by the user (as is done in reshape()).
See, for instance, the following from base reshape():
> head(reshape(x.wide, direction="long", idvar=1, varying=2:13, sep="_"))
surveyNum time pio caremgmt prev price
1.1 1 1 2 2 1 2
2.1 2 1 2 1 2 1
3.1 3 1 1 1 2 2
4.1 4 1 2 2 1 5
5.1 5 1 1 1 1 3
6.1 6 1 1 2 2 4
If I'm familiar with this, and I see that your function takes care of "varying" for me, I might be tempted to try:
> head(reshapeasy( x.wide, "long", id="surveyNum", sep="_" ))
Error in `row.names<-.data.frame`(`*tmp*`, value = paste(d[, idvar], times[1L], :
duplicate 'row.names' are not allowed
In addition: Warning message:
non-unique value when setting 'row.names': ‘1.1’
But that's not a very useful error. Perhaps including a custom error message might be useful for your final function.
Allowing the user to set vary to NULL, as you have done in your present version of the function, also doesn't seem wise to me. This yields output like this:
> head(reshapeasy( x.wide, "long", id="surveyNum", NULL, sep="_" ))
surveyNum pio caremgmt prev price
1.1 1 2 2 1 2
2.1 2 2 1 2 1
3.1 3 1 1 2 2
4.1 4 2 2 1 5
5.1 5 1 1 1 3
6.1 6 1 2 2 4
The problem with this output is that if I needed to reshape back to wide, I can't do it easily. Thus, I think that retaining reshape's default option of generating a time variable, but letting the user override that might be a useful feature.
Perhaps for those who are lazy and don't like to type the variable names, you can add the following to the head of your function:
# Column positions are translated to column names; any other kind of value
# (e.g. names given as character strings) is left untouched.
if (is.numeric(id)) id = colnames(data)[id]
if (is.numeric(vary)) vary = colnames(data)[vary]
Then, following with your examples, you can use the following shorthand:
reshapeasy(x.wide, direction="long", id=1, sep="_", vary="id")
reshapeasy(x.long, direction="wide", id=6, vary=1)
(I know, it might not be good practice since the code might be less readable or less easily understandable by someone later on, but it does happen frequently.)
Related
I have a data frame of 2511 rows and 6 columns with candy and color items. Please see the first 15 rows as below:
structure(list(x = 1:15, iteml = structure(c(2L, 1L, 1L, 1L,
5L, 4L, 4L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("{dulce1_rojo",
"{dulce2_verde", "{dulce7_plata", "{miel21_amarillo", "{miel30_azul"
), class = "factor"), item2 = structure(c(4L, 2L, 2L, 2L, 1L,
5L, 5L, 4L, 3L, 3L, 4L, 1L, 4L, 4L, 1L), .Label = c("chocolate2l_amarillo",
"dulce2_verde", "dulce7_plata", "miel21_amarillo", "miel30_azul"
), class = "factor"), item3 = structure(c(1L, 1L, 3L, 3L, 2L,
2L, 1L, 2L, 2L, 3L, 2L, 2L, 2L, 1L, 2L), .Label = c("chocolate2l_amarillo",
"chocolate30_azul", "miel21_amarillo"), class = "factor"), item4 = structure(c(2L,
2L, 2L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("chocolate2l_amarillo",
"chocolate32_violeta", "cookie30_azul"), class = "factor"), item5 = structure(c(2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("cookie2l_amarillo}",
"cookie32_violeta}"), class = "factor"), item6 = structure(c(4L,
6L, 1L, 3L, 6L, 1L, 2L, 4L, 6L, 2L, 5L, 6L, 1L, 2L, 4L), .Label = c(">{chocolate2l_amarillo}",
">{chocolate30_azul}", ">{chocolate32_violeta}", ">{dulce1_rojo}",
">{dulce7_plata}", ">{miel21_amarillo}"), class = "factor")), class = "data.frame", row.names = c(NA,
-15L))
I don't know how to count, in new columns, how many of each kind of candy each row has. Here is the first line of the expected output of the resulting data frame:
x iteml item2 item3 item4 item5 item6 dulce miel chocolate cookie
1 1 {dulce2_verde miel21_amarillo chocolate2l_amarillo chocolate32_violeta cookie32_violeta} >{dulce1_rojo} 2 1 2 1
I'm stuck and I'd appreciate a little help.
You can use the apply function to apply grepl to each row of the initial data frame, and sapply to iterate over the four ingredients you indicated. Then use cbind to concatenate the initial data frame and the ingredient counts into one. Please see the code below:
# initialize data frame
df <- structure(list(x = 1:15, iteml = structure(c(2L, 1L, 1L, 1L,
5L, 4L, 4L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("{dulce1_rojo",
"{dulce2_verde", "{dulce7_plata", "{miel21_amarillo", "{miel30_azul"
), class = "factor"), item2 = structure(c(4L, 2L, 2L, 2L, 1L,
5L, 5L, 4L, 3L, 3L, 4L, 1L, 4L, 4L, 1L), .Label = c("chocolate2l_amarillo",
"dulce2_verde", "dulce7_plata", "miel21_amarillo", "miel30_azul"
), class = "factor"), item3 = structure(c(1L, 1L, 3L, 3L, 2L,
2L, 1L, 2L, 2L, 3L, 2L, 2L, 2L, 1L, 2L), .Label = c("chocolate2l_amarillo",
"chocolate30_azul", "miel21_amarillo"), class = "factor"), item4 = structure(c(2L,
2L, 2L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("chocolate2l_amarillo",
"chocolate32_violeta", "cookie30_azul"), class = "factor"), item5 = structure(c(2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("cookie2l_amarillo}",
"cookie32_violeta}"), class = "factor"), item6 = structure(c(4L,
6L, 1L, 3L, 6L, 1L, 2L, 4L, 6L, 2L, 5L, 6L, 1L, 2L, 4L), .Label = c(">{chocolate2l_amarillo}",
">{chocolate30_azul}", ">{chocolate32_violeta}", ">{dulce1_rojo}",
">{dulce7_plata}", ">{miel21_amarillo}"), class = "factor")), class = "data.frame", row.names = c(NA,
-15L))
# Count, for every row of df, how many cells mention each ingredient.
ingridients <- c("dulce", "miel", "chocolate", "cookie")
x <- sapply(ingridients, function(ingredient) {
  apply(df, 1, function(cells) sum(grepl(ingredient, cells)))
})
# Append one count column per ingredient to the original data.
df_res <- cbind(df, x)
head(df_res)
Output:
x iteml item2 item3 item4 item5 item6 dulce miel chocolate cookie
1 1 {dulce2_verde miel21_amarillo chocolate2l_amarillo chocolate32_violeta cookie32_violeta} >{dulce1_rojo} 2 1 2 1
2 2 {dulce1_rojo dulce2_verde chocolate2l_amarillo chocolate32_violeta cookie32_violeta} >{miel21_amarillo} 2 1 2 1
3 3 {dulce1_rojo dulce2_verde miel21_amarillo chocolate32_violeta cookie32_violeta} >{chocolate2l_amarillo} 2 1 2 1
4 4 {dulce1_rojo dulce2_verde miel21_amarillo chocolate2l_amarillo cookie32_violeta} >{chocolate32_violeta} 2 1 2 1
5 5 {miel30_azul chocolate2l_amarillo chocolate30_azul cookie30_azul cookie2l_amarillo} >{miel21_amarillo} 0 2 2 2
6 6 {miel21_amarillo miel30_azul chocolate30_azul cookie30_azul cookie2l_amarillo} >{chocolate2l_amarillo} 0 2 2 2
I am working with a data set of changes over time and need to calculate the time at which the peak change occurs. I am running into a problem because some subjects have missing data (NA's).
Example:
library(dplyr)
Data <- structure(list(Subject = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L), .Label = c("1", "10", "11", "12", "13", "14", "16",
"17", "18", "19", "2", "20", "21", "22", "23", "24", "25", "26",
"27", "28", "29", "3", "31", "32", "4", "5", "7", "8", "9"), class = "factor"),
Close = structure(c(1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L
), .Label = c("High Predictability", "Low Predictability"
), class = "factor"), SOA = structure(c(2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L), .Label = c("Long SOA", "Short SOA"), class = "factor"),
Time = c(-66.68, -66.68, -66.68, -66.68, -33.34, -33.34,
-33.34, -33.34, 0, 0, 0, 0, 33.34, 33.34, 33.34, 33.34, 66.68,
66.68, 66.68, 66.68, -66.68, -66.68, -66.68, -66.68, -33.34,
-33.34, -33.34, -33.34, 0, 0, 0, 0, 33.34, 33.34, 33.34,
33.34, 66.68, 66.68, 66.68, 66.68), Pcent_Chng = c(0.12314,
0.048254, -0.098007, 0.023216, 0.20327, 0.08338, -0.15157,
0.030008, 0.26442, 0.12019, -0.22878, 0.035547, 0.31849,
0.15488, -0.26887, 0.038992, 0.39489, 0.15112, -0.31185,
0.02144, NA, 0.046474, NA, 0.17541, NA, 0.14975, NA, 0.3555,
NA, -0.1736, NA, 0.72211, NA, -0.32201, NA, 1.0926, NA, -0.39551,
0.72211, 1.4406)), class = "data.frame", row.names = c(NA, -40L
), .Names = c("Subject", "Close", "SOA", "Time", "Pcent_Chng"
))
I get an error with the following attempt:
Data %>%
group_by(Subject,Close,SOA) %>%
summarize(Peak_Pcent = max(Pcent_Chng),
Peak_Latency = Time[which.max(Pcent_Chng)])
The error is:
Error in summarise_impl(.data, dots) :
Column `Peak_Latency` must be length 1 (a summary value), not 0
This seems to be due to the NA's, which are only in some SOA conditions. Using complete.cases() with my actual data is too aggressive and removes too much data.
Is there a workaround to ignore the NA's?
One group has Pcent_Chng values that are all NA, and another group has only one non-NA Pcent_Chng value. I think it is better to filter out the groups whose Pcent_Chng values are all NA, and to set na.rm = TRUE when calling max.
Data %>%
group_by(Subject,Close,SOA) %>%
filter(!all(is.na(Pcent_Chng))) %>% # Filter out groups with Pcent_Chng all is NA
summarize(Peak_Pcent = max(Pcent_Chng, na.rm = TRUE), # Set na.rm = TRUE
Peak_Latency = Time[which.max(Pcent_Chng)])
# # A tibble: 7 x 5
# # Groups: Subject, Close [?]
# Subject Close SOA Peak_Pcent Peak_Latency
# <fctr> <fctr> <fctr> <dbl> <dbl>
# 1 1 High Predictability Long SOA 0.154880 33.34
# 2 1 High Predictability Short SOA 0.394890 66.68
# 3 1 Low Predictability Long SOA 0.038992 33.34
# 4 1 Low Predictability Short SOA -0.098007 -66.68
# 5 14 High Predictability Long SOA 0.149750 -33.34
# 6 14 Low Predictability Long SOA 1.440600 66.68
# 7 14 Low Predictability Short SOA 0.722110 66.68
This should do the trick:
# For each Subject/Close/SOA group, keep the row(s) whose Pcent_Chng equals
# the group's maximum, carrying that maximum along as Peak_Pcent.
# NOTE(review): max() is called without na.rm = TRUE, so any group containing
# an NA gets Peak_Pcent = NA and the final filter drops every row of that
# group, including groups that still have some non-missing values.
Data %>%
group_by(Subject, Close, SOA) %>%
mutate(Peak_Pcent = max(Pcent_Chng)) %>%
arrange(Subject, Close, SOA) %>%
filter(Peak_Pcent == Pcent_Chng)
The output:
# A tibble: 6 x 6
# Groups: Subject, Close, SOA [6]
Subject Close SOA Time Pcent_Chng Peak_Pcent
<fctr> <fctr> <fctr> <dbl> <dbl> <dbl>
1 1 High Predictability Long SOA 33.34 0.154880 0.154880
2 1 High Predictability Short SOA 66.68 0.394890 0.394890
3 1 Low Predictability Long SOA 33.34 0.038992 0.038992
4 1 Low Predictability Short SOA -66.68 -0.098007 -0.098007
5 14 High Predictability Long SOA -33.34 0.149750 0.149750
6 14 Low Predictability Long SOA 66.68 1.440600 1.440600
Originally we were thinking of a one-way ANOVA, but it seems I need a two-way ANOVA because I have two independent variables: SESSION (the time that the minnow trap samples were taken) and TRAP (the individual trap, four per hole to be averaged, for each alligator hole in each macrocosm). CPUE would be the dependent variable, and then there is the ID column.
SESSION TRAP CPUE ID
One M1E1 3 1
One M1E2 0 2
One M1E3 0 3
One M1E4 2 4
One M1W1 0 5
One M1W2 0 6
One M1W3 0 7
One M1W4 0 8
One M2E1 0 9
One M2E2 0 10
One M2E3 0 11
One M2E4 0 12
One M2W1 0 13
One M2W2 1 14
One M2W3 1 15
One M2W4 0 16
One M3E1 5 17
One M3E2 2 18
One M3E3 0 19
One M3E4 3 20
One M3W1 0 21
One M3W2 0 22
One M3W3 0 23
One M3W4 2 24
One M4E1 0 25
One M4E2 1 26
One M4E3 0 27
One M4E4 0 28
One M4W1 0 29
One M4W2 0 30
One M4W3 0 31
One M4W4 8 32
Two M4E1 23 33
Two M4E2 5 34
Two M4E3 0 35
Two M4E4 10 36
Two M4W1 23 37
Two M4W2 7 38
Two M4W3 1 39
Two M4W4 7 40
Two M3E1 6 41
Two M3E2 3 42
Two M3E3 5 43
Two M3E4 10 44
Two M3W1 8 45
Two M3W2 0 46
Two M3W3 1 47
Two M3W4 5 48
Two M2E1 12 49
Two M2E2 15 50
Two M2E3 3 51
Two M2E4 10 52
Two M2W1 5 53
Two M2W2 11 54
Two M2W3 6 55
Two M2W4 4 56
Two M1E1 13 57
Two M1E2 19 58
Two M1E3 3 59
Two M1E4 30 60
Two M1W1 16 61
Two M1W2 2 62
Two M1W3 4 63
Two M1W4 27 64
Three M4E1 0 65
Three M4E2 26 66
Three M4E3 3 67
Three M4E4 13 68
Three M4W1 9 69
Three M4W2 0 70
Three M4W3 4 71
Three M4W4 2 72
Three M3E1 29 73
Three M3E2 0 74
Three M3E3 0 75
Three M3E4 11 76
Three M3W1 27 77
Three M3W2 5 78
Three M3W3 8 79
Three M3W4 3 80
Three M2E1 5 81
Three M2E2 11 82
Three M2E3 62 83
Three M2E4 31 84
Three M2W1 11 85
Three M2W2 1 86
Three M2W3 0 87
Three M2W4 9 88
Three M1E1 48 89
Three M1E2 78 90
Three M1E3 14 91
Three M1E4 7 92
Three M1W1 3 93
Three M1W2 63 94
Three M1W3 43 95
Three M1W4 31 96
I am using this command:
> output = ezANOVA(data = CSV.Repeated.Measures.ANOVA.Minnow._2cm.R.Data.Sheet, dv= CPUE, wid = ID, within = .(SESSION, TRAP), detailed = TRUE, type = 3)
I Get this error message:
Error in ezANOVA_main(data = data, dv = dv, wid = wid, within =
within, : One or more cells is missing data. Try using ezDesign()
to check your data.
I don't know what ezDesign() is trying to tell me either.
I will try to give a solution to your problem with ezANOVA. Of course, it would be necessary to know all the details of your experiment for a complete and correct answer to your question.
If I am not wrong, you wrote that minnow traps are the sample units of the experiment and repeated measures are made on these units (under different experimental conditions). Hence the IDs of the sample units are not those stored in the column ID; a new id variable needs to be generated.
Here is the dataset:
df <- structure(list(SESSION = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("One", "Three",
"Two"), class = "factor"), TRAP = structure(c(1L, 2L, 3L, 4L,
1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L,
1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L,
1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L,
1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L,
1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L,
1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), .Label = c("1",
"2", "3", "4"), class = "factor"), CPUE = c(3L, 0L, 0L, 2L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 5L, 2L, 0L, 3L, 0L,
0L, 0L, 2L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 8L, 23L, 5L, 0L, 10L,
23L, 7L, 1L, 7L, 6L, 3L, 5L, 10L, 8L, 0L, 1L, 5L, 12L, 15L, 3L,
10L, 5L, 11L, 6L, 4L, 13L, 19L, 3L, 30L, 16L, 2L, 4L, 27L, 0L,
26L, 3L, 13L, 9L, 0L, 4L, 2L, 29L, 0L, 0L, 11L, 27L, 5L, 8L,
3L, 5L, 11L, 62L, 31L, 11L, 1L, 0L, 9L, 48L, 78L, 14L, 7L, 3L,
63L, 43L, 31L), ID = structure(1:96, .Label = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26",
"27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37",
"38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48",
"49", "50", "51", "52", "53", "54", "55", "56", "57", "58", "59",
"60", "61", "62", "63", "64", "65", "66", "67", "68", "69", "70",
"71", "72", "73", "74", "75", "76", "77", "78", "79", "80", "81",
"82", "83", "84", "85", "86", "87", "88", "89", "90", "91", "92",
"93", "94", "95", "96"), class = "factor"), MACROCOSM = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3", "4"), class = "factor"), HOLE = structure(c(1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("E",
"W"), class = "factor")), .Names = c("SESSION", "TRAP", "CPUE",
"ID", "MACROCOSM", "HOLE"), row.names = c(NA, -96L), class = "data.frame")
and here is the code that (hopefully) should show you the way for finding a solution to your problem:
df$MACROCOSM <- factor(substr(df$TRAP, 2, 2))
df$HOLE <- factor(substr(df$TRAP, 3, 3))
df$TRAP <- factor(substr(df$TRAP, 4, 4))
library(ez)
ezOut <- ezANOVA(data = df,
dv=CPUE, wid = .(TRAP), within = .(SESSION,HOLE,MACROCOSM),
detailed = TRUE, type = 1)
print(ezOut)
#############
$ANOVA
Effect DFn DFd SSn SSd F p p<.05 ges
1 SESSION 2 6 4372.5625 753.35417 17.4123780 0.003174556 * 0.30542230
2 HOLE 1 3 276.7604 56.11458 14.7961760 0.031011624 * 0.02707856
3 MACROCOSM 3 9 2030.5313 1466.76042 4.1530939 0.041961697 * 0.16957246
4 SESSION:HOLE 2 6 216.2708 60.47917 10.7278677 0.010436491 * 0.02128617
5 SESSION:MACROCOSM 6 18 2327.6875 3995.39583 1.7477774 0.167180534 0.18968127
6 HOLE:MACROCOSM 3 9 198.6146 1070.34375 0.5566845 0.656642963 0.01958241
7 SESSION:HOLE:MACROCOSM 6 18 461.4792 2541.43750 0.5447458 0.767574519 0.04435012
I have following data with peculiar missing values situation (all values of vnum1 for vcat1==3 are missing):
> head(mydf)
vnum1 vcat1
1 -0.1624229 1
2 0.2465567 1
3 NA 3
4 0.7067778 2
5 NA 3
6 -0.2241726 4
> dput(mydf)
structure(list(vnum1 = c(-0.162422853864248, 0.246556718176803,
NA, 0.706777793886275, NA, -0.224172615208867, 0.0545850414695318,
NA, NA, -1.94778020954922, 1.89581259201036, 0.901973743223488,
-0.31255172156186, -1.67311124367419, 0.491316838004494, NA,
-0.699315343799762, 0.668020448193884, 1.45492995320554, 1.17747976289091,
-0.65137204397438, 1.78678696473193, 2.58978935829221, NA, 1.26534157843481,
0.629748102812663, 0.246596558590885, 0.968707124353133, 0.108668693948881,
-0.219419917000748, 2.25307417017233, -0.626124211646445, -1.16298694223082,
-1.23524906047676, -2.34636152907898, NA, 0.408667368960836,
0.272596114054819, 0.747455245383144, -0.745843219461836, -0.0966351379737077,
1.44803320811527, -1.5434982335725, -0.782902668540696, -0.448286848257394,
NA, 0.168327130336994, -0.493721325506037, 0.397253883862878,
1.57070527855864), vcat1 = structure(c(1L, 1L, 3L, 2L, 3L, 4L,
4L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 4L, 3L, 4L, 4L, 4L, 1L, 2L, 4L,
1L, 3L, 2L, 4L, 2L, 1L, 4L, 2L, 2L, 4L, 2L, 1L, 1L, 3L, 1L, 4L,
4L, 4L, 4L, 2L, 4L, 1L, 4L, 3L, 1L, 4L, 4L, 1L), .Label = c("1",
"2", "3", "4"), class = "factor")), .Names = c("vnum1", "vcat1"
), row.names = c(NA, 50L), class = "data.frame")
If I use tapply, I clearly see the missing category:
> with(mydf,tapply(vnum1, vcat1, mean))
1 2 3 4
0.09172749 0.48575555 NA 0.09632024
But it is totally ignored in aggregate function:
> aggregate(vnum1~vcat1, mydf, mean)
vcat1 vnum1
1 1 0.09172749
2 2 0.48575555
3 4 0.09632024
I want to get it in aggregate function also. How can I do it? Thanks.
In the formula method, use na.action = NULL to keep the NA result.
aggregate(vnum1 ~ vcat1, mydf, mean, na.action = NULL)
# vcat1 vnum1
# 1 1 0.09172749
# 2 2 0.48575555
# 3 3 NA
# 4 4 0.09632024
You could have also used the data frame method and not have this worry.
with(mydf, aggregate(list(vnum1 = vnum1), list(vcat1 = vcat1), mean))
Is it possible to return ddply results for only certain values of the splitting variable? For example, with the dataframe example:
example <- structure(list(shape = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("circle", "square", "triangle"
), class = "factor"), property = structure(c(1L, 3L, 2L, 1L,
2L, 3L, 1L, 1L, 1L, 1L, 2L, 3L, 1L, 1L), .Label = c("color",
"intensity", "size"), class = "factor"), value = structure(c(5L,
2L, 1L, 5L, 4L, 1L, 5L, 6L, 6L, 7L, 4L, 3L, 6L, 5L), .Label = c("3",
"5", "6", "7", "blue", "green", "red"), class = "factor")), .Names = c("shape",
"property", "value"), class = "data.frame", row.names = c(NA,
-14L))
which looks like this
shape property value
1 circle color blue
2 circle size 5
3 circle intensity 3
4 circle color blue
5 square intensity 7
6 square size 3
7 square color blue
8 square color green
9 square color green
10 triangle color red
11 triangle intensity 7
12 triangle size 6
13 triangle color green
14 triangle color blue
I want to return a dataframe containing the number of each shape that has a certain color, which would be something like this:
shape property blue green red
1 circle color 2 0 0
2 square color 1 2 0
3 triangle color 1 1 1
However, I can't seem to get this to return properly! I've gotten part of the way using something like this:
ColorSummary <- ddply(example,.(shape,property="color"), function(example) summary(example$value))
But this is returning a dataframe with columns for all of the other unique values (from the properties size and intensity, which I do not want):
shape property 3 5 6 7 blue green red
1 circle color 1 1 0 0 2 0 0
2 square NA 1 0 0 1 1 2 0
3 triangle NA 0 0 1 1 1 1 1
What am I doing wrong - is there a way to return a dataframe like the first result that I showed?
Also, while this is a small and fast example, my "real" data are much bigger and take a long time to calculate. Does the speed of ddply improve by limiting to only property="color"?
EDIT: Thanks for the answers so far! Unfortunately for me, I oversimplified the situation and I'm not sure if the dcast solution will work for me. Let me explain - I am actually working with a dataframe example2:
example2 <- structure(list(factory = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("A",
"B"), class = "factor"), shape = structure(c(1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L), .Label = c("circle",
"square", "triangle"), class = "factor"), property = structure(c(1L,
3L, 2L, 1L, 2L, 3L, 1L, 1L, 1L, 1L, 2L, 3L, 1L, 1L, 1L, 3L, 2L
), .Label = c("color", "intensity", "size"), class = "factor"),
value = structure(c(5L, 2L, 1L, 5L, 4L, 1L, 5L, 6L, 6L, 7L,
4L, 3L, 6L, 5L, 5L, 2L, 1L), .Label = c("3", "5", "6", "7",
"blue", "green", "red"), class = "factor")), .Names = c("factory",
"shape", "property", "value"), class = "data.frame", row.names = c(NA,
-17L))
and I am trying to split by both factory and shape. I have a messy solution using ddply:
ColorSummary2 <- ddply(example2,.(factory,shape,property="color"), function(example2) summary(example2$value))
which gives
factory shape property 3 5 6 7 blue green red
1 A circle color 1 1 0 0 2 0 0
2 A square NA 1 0 0 1 1 2 0
3 A triangle NA 0 0 1 1 1 1 1
4 B circle NA 1 1 0 0 1 0 0
but what I would like to return is this (sorry for the messy table, I have trouble formatting tables on here):
factory shape property blue green red
1 A circle color 2 0 0
2 A square NA 1 2 0
3 A triangle NA 1 1 1
4 B circle NA 1 0 0
Is this possible?
EDIT 2: Sorry for all of the edits, I oversimplified my situation way too much. Here is a more complex dataframe that is closer to my real example. This one has a column state, which I do not want to use for splitting. I can do this (messily) with ddply, but can I ignore state using dcast?
example3 <- structure(list(state = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("CA", "FL"
), class = "factor"), factory = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("A",
"B"), class = "factor"), shape = structure(c(1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L), .Label = c("circle",
"square", "triangle"), class = "factor"), property = structure(c(1L,
3L, 2L, 1L, 2L, 3L, 1L, 1L, 1L, 1L, 2L, 3L, 1L, 1L, 1L, 3L, 2L
), .Label = c("color", "intensity", "size"), class = "factor"),
value = structure(c(5L, 2L, 1L, 5L, 4L, 1L, 5L, 6L, 6L, 7L,
4L, 3L, 6L, 5L, 5L, 2L, 1L), .Label = c("3", "5", "6", "7",
"blue", "green", "red"), class = "factor")), .Names = c("state",
"factory", "shape", "property", "value"), class = "data.frame", row.names = c(NA,
-17L))
Using dcast from reshape2:
dcast(...~value,data=subset(example,property=='color'))
Aggregation function missing: defaulting to length
shape property blue green red
1 circle color 2 0 0
2 square color 1 2 0
3 triangle color 1 1 1
EDIT
using the second data set example:
dcast(...~value,data=subset(example2,property=='color'))
Aggregation function missing: defaulting to length
factory shape property blue green red
1 A circle color 2 0 0
2 A square color 1 2 0
3 A triangle color 1 1 1
4 B circle color 1 0 0