Change the order in a xtab - r

For a sample data.frame:
df <- structure(list(region = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("a", "b", "c", "d"), class = "factor"),
result = c(0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L), weight = c(0.126,
0.5, 0.8, 1.5, 5.3, 2.2, 3.2, 1.1, 0.1, 1.3, 2.5)), .Names = c("region",
"result", "weight"), row.names = c(NA, 11L), class = "data.frame")
df$region <- factor(df$region)
result <- xtabs(weight ~ region + result, data=df)
result
How would I go about changing the order of the xtab (I don't want to switch the axes which I asked previously)? For example ensuring 1 was always the first result or b was the first region.
If anyone has any ideas, I would appreciate it. I am doing some followup statistics on this data and I need to ensure my values are the right way round.

Try the following, using relevel, which resets the base level of a factor:
df$region <- relevel(df$region, "b")
xtabs(weight ~ region + result, data=df)
Perhaps the best way to do this if you want to order multiple levels is to set these up initially when first using factor:
df$region2 <- df$region <- factor(df$region, levels=c("b", "c", "a"))
or whatever order you want.

Related

Using method="nls" to plot 3 parameter Weibull curve

I would like to plot several 3 parameter Weibull curves with the function y ~ mexp(-1(x/b)^c).
In most cases I could define the starting parameters and plot the curves using nls2 and ggplot2. However, I fail to plot one particular curve in ggplot2 (I could fit the curves using nls2 though).
It is important to me to use the the method="nls" command in ggplot (and not loess, the latter tends to overfit the curves in many cases).
Could anyone help me with this? Much appreciated!
#Load packages
library(ggplot2)
library(nlme)
library(nls2)
# library(proto)
#model structure: 3 parameter Weibull
#y ~ m*exp(-1*(x/b)^c)
dip3dta<-structure(list(ploidy = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("dip", "trp"), class = "factor"),
geno = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L), .Label = c("dip1", "dip2", "dip3", "dip4", "dip5",
"dip6", "dip7", "dip8", "trp1", "trp2", "trp3", "trp4", "trp5",
"trp6", "trp7", "trp8"), class = "factor"), Photo = c(10.03907124,
16.04016877, 5.799933798, 6.256058037, 1.34916505, 9.609508391,
12.84023945, 8.436093321, 7.732332332, 15.38729611, 2.157795186,
5.93553951, 3.37322132), WBPhoto = c(11.77970983, 13.52705488,
7.585920181, 6.118582453, 2.570461685, 10.80358492, 9.445462376,
5.386306724, 5.840252952, 15.84494637, 3.60398487, 9.32456564,
3.437440219), PDLWP = c(3.1, 2.6, 5.8, 7.7, 19, 3.5, 4.25,
9, 8.16, 2.25, 13.92, 4.33, 14.58), Treatment = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "DC", class = "factor")), row.names = 27:39, class = "data.frame")
#Define the outer boundaries to search for initial values
grddip3 <- data.frame(m=c(0,45),
b=c(0,8),
cc=c(0,0.6))
#Brute-force initial values
fitdip3 <- nls2(Photo ~ m*exp(-1*(PDLWP/b)^cc),
data=dip3dta,
start = grddip3,
algorithm = "brute-force")
fitdip3
finalfitdip3 <- nls(Photo ~ m*exp(-1*(PDLWP/b)^cc),
data=dip3dta,
start=as.list(coef(fitdip3)),
algorithm = "port",
lower = coef(fitdip3)/1000,
control=list(maxiter=1000))
The above code works fine but I can't plot the curve in ggplot using method="nls"
# Plot in ggplot
exampledip3<-ggplot(dip3dta, aes(x=PDLWP,y=Photo)) + geom_point(size=2)+
theme_classic()+
theme(legend.position="none")+
theme(axis.text=element_text(size=18),
axis.title=element_text(size=17,),axis.title.y=element_text(margin=margin(0,20,0,0)))+
stat_smooth(method = "nls", formula = y ~ m*exp(-1*(x/b)^c), size = 0.9, se = FALSE, colour = "black")

In R I want to select max timepoints, while grouping?

I want to select the minimum Timepoint, maximum Timepoint and the duration (difference between max and min) grouped by Replicate, Stimulus, Attribute and Complexity
structure(list(Replicate = c(1L, 1L, 1L, 1L, 1L, 1L), Stimulus = c(1L,
1L, 1L, 1L, 1L, 1L), Subject = c("S001", "S001", "S001", "S001",
"S001", "S001"), Attribute = c("Soft", "Soft", "Soft", "Soft",
"Soft", "Soft"), Timepoint = c(0.77, 0.78, 0.79, 0.8, 0.81, 0.82
), Dominant = c(1L, 1L, 1L, 1L, 1L, 1L), Complexity = c(2L, 2L,
2L, 2L, 2L, 2L)), row.names = c(NA, 6L), class = "data.frame")
I am using the following code
modified_tds_merged2<-tds_merged.df %>%
as.data.frame() %>%
mutate(Timepoint = as.numeric(gsub("[a-zA-Z]+", "", Timepoint))) %>%
group_by(Replicate, Stimulus, Subject, Attribute, Complexity) %>%
summarise(
start_time = min(Timepoint),
end_time = max(Timepoint),
duration = end_time - start_time,
n = n()
) %>%
ungroup()
However the result is inncorrect, the endtimes are often incorrect as they overlap when ther can be only 1 Complexity rating at any timepoint, and seem random. Here is an example of the result. You can see that for S008, crumbly_particles, Complexity rating of 3 goes from 0.47 to 0.71 and Complexity rating of 4 goes from 0.51 to 0.66. When I check back on tds_merged.df, Complexity rating 3 should be from .47 to.50 and .67 to .71, and Complexity rating 4 is from .51 to .66 which is correct. So it looks like my code doesn't specify that if the Complexity rating changes form 3 to 4 and back to 3, the two 3 ratings need to be calculated sperarately.
structure(list(Replicate = c(1L, 1L, 1L, 1L, 1L, 1L), Stimulus = c(1L,
1L, 1L, 1L, 1L, 1L), Subject = c("S001", "S004", "S004", "S008",
"S008", "S008"), Attribute = c("Soft", "Crumbly_Particles", "Soft",
"Crumbly_Particles", "Crumbly_Particles", "Crunchy"), Complexity = c(2L,
2L, 2L, 3L, 4L, 2L), start_time = c(0.77, 0.95, 0.19, 0.47, 0.51,
0.79), end_time = c(0.99, 0.99, 0.94, 0.71, 0.66, 0.82), duration = c(0.22,
0.04, 0.75, 0.24, 0.15, 0.0299999999999999), n = c(23L, 5L, 76L,
9L, 16L, 4L)), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
Sorry for long "question"!! Hope someone can help. Maybe the problem is using max and min, is there another function, I am new to R?
"So it looks like my code doesn't specify that if the Complexity rating changes". Correct. group_by sorts your data (whether explicitly or implicitly is not always clear, but that's another story). So your solution is to introduce another variable, RunID for example, that changes every time Complexity changes within Replicate, Stimulus, Subject and Attribute.
As suggested above I used the rleid function to creat a new group ID variable to allow for the situation in my question.

Re-levelling in R for a xtab based on a condition

For a sample dataframe:
df <- structure(list(region = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("a", "b", "c", "d"), class = "factor"),
result = c(1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L), weight = c(0.126,
0.5, 0.8, 1.5, 5.3, 2.2, 3.2, 1.1, 0.1, 1.3, 2.5)), .Names = c("region",
"result", "weight"), row.names = c(NA, 11L), class = "data.frame")
I draw a cross tabulation using:
df$region <- factor(df$region)
result <- xtabs(weight ~ region + result, data=df)
result
However I want to ensure the regions of the xtab are in order of magnitude of percentage 1s in sample. (i.e. 1s represent 29% of region a and 33% of region b). Therefore I would like the xtab to be reordered, so region b is first, then a.
I know I could use relevel, however this would be dependent on me looking at the result and re-levelling where appropriate.
Instead I want this to be automatic in the code and not dependent on the user (as this code will be running lots of times, and completing further analysis on the resulting xtab).
If anyone has any ideas, I would greatly appreciate it.
You can reorder the xtab on the values of the second column using order as follows:
result[order(result[, 2], decreasing=T),]
order ranks the values, adding decreasing=T ranks from top to bottom.

plotting 3 variables on a single plot in ggplot2

Hi have an experiment which consists of three variables, and I would like to plot them all on a single plot.
This is my df:
AB <- data.frame(block=c("A", "A", "A", "A", "B", "B", "B", "B" ),
familiarity=c("fam", "fam", "unfam", "unfam" ),
prime=c("P", "UP" ),
RT=c("570.6929", "628.7446", "644.6268", "607.4312", "556.3581", "645.4821", "623.5624", "604.4113"))
Right now I can only break one of the variables into two separate plots, like this where A and B are the two levels of the third variable:
A <- AB[which(AB$block == "A"),]
B <- AB[which(AB$block == "B"),]
pa <- ggplot(data=A, aes(x=prime, y=RT, group=familiarity)) +
geom_line(aes(linetype=familiarity), size=1) +
expand_limits(y=c(500,650))
pb <- ggplot(data=B, aes(x=prime, y=RT, group=familiarity)) +
geom_line(aes(linetype=familiarity), size=1) +
expand_limits(y=c(500,650))
I would like to superimpose plot A over plot B, and have this third variables to be identified by color.
Any ideas?
Is this what you mean?
p_all <- ggplot(AB, aes(x=prime,y=RT,group=interaction(familiarity,block))) +
geom_line(aes(linetype=familiarity,color=block))
Data used:
AB <- structure(list(block = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("A", "B"), class = "factor"), familiarity = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L), class = "factor", .Label = c("fam",
"unfam")), prime = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L
), class = "factor", .Label = c("P", "UP")), RT = c(570.6929,
628.7446, 644.6268, 607.4312, 556.3581, 645.4821, 623.5624, 604.4113
)), .Names = c("block", "familiarity", "prime", "RT"), row.names = c(NA,
-8L), class = "data.frame")
IF you have different datasets for those variables, then you can specify the data
ggplot()+
geom_line(data=A, aes(x=prime, y=RT, group=familiarity,linetype=familiarity), size=1) +
geom_line(data=B, aes(x=prime, y=RT, group=familiarity,linetype=familiarity), size=1)+
expand_limits(y=c(500,650))

How do I understand the warnings from rbind?

If I have two data.frames with the same column names, I can use rbind to make a single data frame. However, if I have one is a factor and the other is an int, I get a warning like this:
Warning message: In [<-.factor(*tmp*, ri, value = c(1L, 1L, 0L,
0L, 0L, 1L, 1L, : invalid factor level, NA generated
The following is a simplification of the problem:
t1 <- structure(list(test = structure(c(1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 2L), .Label = c("False", "True"), class = "factor")), .Names = "test", row.names = c(NA,
-10L), class = "data.frame")
t2 <- structure(list(test = c(1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L
)), .Names = "test", row.names = c(NA, -10L), class = "data.frame")
rbind(t1, t2)
With the single column, this is easy to understand, but when it is part of a dozen or more factors, it can be difficult. What is there about the warning message to tell me which column to look at? Barring that, what is a good technique to understand which column is in error?
You could knock up a simple little comparison script using class and mapply, to compare where the rbind will break down due to non-matching data types, e.g.:
one <- data.frame(a=1,b=factor(1))
two <- data.frame(b=2,a=2)
common <- intersect(names(one),names(two))
mapply(function(x,y) class(x)==class(y), one[common], two[common])
# a b
# TRUE FALSE
Based on thelatemail's answer, here is a function to compare two data.frames for rbinding:
mergeCompare <- function(one, two) {
cat("Distinct items: ", setdiff(names(one),names(two)), setdiff(names(two),names(one)), "\n")
print("Non-matching items:")
common <- intersect(names(one),names(two))
print (mapply(function(x,y) {class(x)!=class(y)}, one[common], two[common]))
}

Resources