Ordering xtab by proportion in R

For a sample dataframe:
df <- structure(list(region = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("a", "b", "c", "d"), class = "factor"),
result = c(1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L), weight = c(0.126,
0.5, 0.8, 1.5, 5.3, 2.2, 3.2, 1.1, 0.1, 1.3, 2.5)), .Names = c("region",
"result", "weight"), row.names = c(NA, 11L), class = "data.frame")
df$region <- factor(df$region)
result <- xtabs(weight ~ region + result, data=df)
result
I want to reorder the 1s of the result column. As I understand it (from here), I could use order:
result <- result[order(result[, 2], decreasing=T),]
result
result
region 0 1
b 6.9 3.500
a 5.8 2.426
However, this appears to be ordering by the raw count of 1s — I want instead to order by the proportion of 1s in each region (i.e. the percentage). How can I use order (or something else) to sort my xtab the way I want?

Use prop.table:
result[order(prop.table(result,1)[,2], decreasing=TRUE),]
# result
#region 0 1
# b 6.9 3.500
# a 5.8 2.426
Where prop.table(result,1) gives:
prop.table(result,1)
# result
#region 0 1
# a 0.7050814 0.2949186
# b 0.6634615 0.3365385

Related

Change the order/flip the xtab in R

For a sample dataframe:
df <- structure(list(region = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("a", "b", "c", "d"), class = "factor"),
result = c(0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L), weight = c(0.126,
0.5, 0.8, 1.5, 5.3, 2.2, 3.2, 1.1, 0.1, 1.3, 2.5)), .Names = c("region",
"result", "weight"), row.names = c(NA, 11L), class = "data.frame")
I am producing a weighted xtab:
df$region <- factor(df$region)
result <- xtabs(weight ~ result + region, data=df)
result
Which is:
region
result a b
0 6.926 6.900
1 1.300 3.500
How can I flip the xtab around so that the region and result variables are the other way around (i.e. region as rows and result as columns)?
I thought this might work, but alas no!
result <- xtabs(region + (weight ~ result), data=df)
Any help would be much appreciated.
Just reverse the order of entries:
xtabs(weight ~ region + result, data=df)

Run function through all combinations of rows in R

For a sample dataframe:
df <- structure(list(area = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L,
4L, 4L, 4L), .Label = c("a1", "a2", "a3", "a4"), class = "factor"),
result = c(0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L,
1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L),
weight = c(0.5, 0.8, 1, 3, 3.4, 1.6, 4, 1.6, 2.3, 2.1, 2,
1, 0.1, 6, 2.3, 1.6, 1.4, 1.2, 1.5, 2, 0.6, 0.4, 0.3, 0.6,
1.6, 1.8)), .Names = c("area", "result", "weight"), class = "data.frame", row.names = c(NA,
-26L))
I wish to calculate the risk difference between all combinations of areas (i.e. a1 and a2, a1 and a3, a2 and a3). Preferably this would be in a matrix form.
Up till now, I have just looked at comparing the risk difference (RD) between the regions with the highest and lowest results:
#Include only regions with highest or lowest percentage
df.summary <- data.table(df.summary)
incl <- df.summary[c(which.min(result), which.max(result)),area]
df.new <- df[df$area %in% incl,]
df.new$area <- factor(df.new$area)
#Run relative difference
df.xtabs <- xtabs(weight ~ result + area, data=df.new)
df.xtabs
#Produce xtabs table
RD.result <- prop.test(x=df.xtabs[,2], n=rowSums(df.xtabs), correct = FALSE)
RD <- round(- diff(RD.result$estimate), 3)
... But how would I change this to ensure the code runs through all combinations of areas without having to specify each one in turn? (I may have up to 19 areas).
You can do it using combn function. For example,
# Compute the risk difference (RD) for every pair of areas.
# Each pair gets its own weighted crosstab, from which prop.test
# estimates the two proportions whose difference is the RD.
uniqueCombinations <- combn(unique(as.character(df$area)), 2)
nPairs <- ncol(uniqueCombinations)

# Preallocate the result: one row per pair, two area columns plus the RD.
resultDF <- data.frame(matrix(NA, nrow = nPairs, ncol = 3))
names(resultDF) <- c(paste0("area_", 1:2), "RD")

for (i in seq_len(nPairs)) {
  # The two areas in the current combination
  incl <- uniqueCombinations[, i]

  # Restrict to the current pair and drop the unused factor levels,
  # so the crosstab has exactly two area columns.
  df.new <- df[df$area %in% incl, ]
  df.new$area <- factor(df.new$area)

  # Weighted crosstab: rows = result (0/1), columns = area
  df.xtabs <- xtabs(weight ~ result + area, data = df.new)

  # Risk difference from the two estimated proportions
  RD.result <- prop.test(x = df.xtabs[, 2], n = rowSums(df.xtabs), correct = FALSE)
  RD <- round(-diff(RD.result$estimate), 3)

  resultDF[i, 1:2] <- incl
  resultDF[i, 3] <- RD
}
resultDF
UPDATE : code update to create a resultDF, which will have result from loop.

Problems with levels in a xtab in R

For a sample dataframe:
df <- structure(list(area = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L,
4L, 4L, 4L), .Label = c("a1", "a2", "a3", "a4"), class = "factor"),
result = c(0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L,
1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L),
weight = c(0.5, 0.8, 1, 3, 3.4, 1.6, 4, 1.6, 2.3, 2.1, 2,
1, 0.1, 6, 2.3, 1.6, 1.4, 1.2, 1.5, 2, 0.6, 0.4, 0.3, 0.6,
1.6, 1.8)), .Names = c("area", "result", "weight"), class = "data.frame", row.names = c(NA,
-26L))
I am trying to isolate the areas with the highest and lowest results, and then produce a weighted crosstab which is then used to calculate the risk difference.
df.summary <- setDT(df)[,.(.N, freq.1 = sum(result==1), result = weighted.mean((result==1),
w = weight)*100), by = area]
#Include only regions with highest or lowest percentage
df.summary <- data.table(df.summary)
incl <- df.summary[c(which.min(result), which.max(result)),area]
df.new <- df[df$area %in% incl,]
incl
'incl' has the two areas that I want, but still the four levels:
[1] a2 a3
Levels: a1 a2 a3 a4
How do I get rid of the levels as well? The subsequent analysis that I want to do needs just the two levels as well as the areas. Any ideas?
I found this elsewhere on the web (e.g. Problems with levels in a xtab in R)
df.new$area <- factor(df.new$area)
It works!
Hope it's useful for others.

Subset a dataframe based on identifying max and min values in a column (in R)

For a sample dataframe:
df1 <- structure(list(id = 1:21, region = structure(c(1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L,
4L), .Label = c("a", "b", "c", "d"), class = "factor"), weight = c(0.35,
0.65, 0.99, 1.5, 3.2, 2.1, 1.3, 3.2, 1.3, 2, 0.6, 0.6, 0.6, 0.45,
1, 1.2, 1.4, 2, 1.3, 1, 2), condition = c(0L, 1L, 0L, 1L, 0L,
0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L
)), .Names = c("id", "region", "weight", "condition"), class = "data.frame", row.names = c(NA,
-21L))
I wish to exclude the regions which do not have either the highest or lowest weighted proportion of 1s in the condition variable by region. For example, I would normally do:
summary <- setDT(df)[,.(.result = weighted.mean((condition==1),
w = weight)*100), by = region]
Which would give me:
summary
region .result
1: a 61.60458
2: b 39.69466
3: c 50.56180
4: d 61.03896
Therefore I would exclude regions c and d from the dataframe df1, keeping only a and b.
Is it possible to do this in one step without having to manually look at a summary dataframe?
My understanding is that you wish to exclude all values that are not the highest and lowest values. It can't be done as a one-liner, but if you add the following, you should get what you want:
incl <- summary[c(which.min(.result), which.max(.result)),region]
newdf <- df1[region %in% incl,]
newdf
id region weight condition
1: 5 b 3.20 0
2: 6 b 2.10 0
3: 7 b 1.30 0
4: 8 b 3.20 1
5: 9 b 1.30 0
6: 10 b 2.00 1
7: 1 a 0.35 0
8: 2 a 0.65 1
9: 3 a 0.99 0
10: 4 a 1.50 1

By row, get mean count of number of columns between values of x

I have a data.frame that contains several columns (i.e. V1...Vn+1) that have a value of 1 or 0, each column is a timestep.
I want to know the average time (# of columns) between values of 1. With a sequence of 1 1 1 1 1 1 having a value of 1.
At the moment the way I can think to possibly compute this would to be to calculate the mean count (+1) of 0s between 1s, but it is flawed.
For example, a row that had these values 1 0 0 1 0 1 would have the result 2.5 (2 + 1 = 3; 3/2 = 1.5; 1.5 + 1 = 2.5).
However, if the sequence begins or ends with 0s the results for this results should be calculated without them. For example, 0 1 0 0 1 1 would be computed as 1 0 0 1 1 with a result of 3.
Flawed e.g. 1 0 1 1 0 0 would be computed as 1 0 1 1 resulting in 2, but this would not be the desired result (1.5)
Is there a way to count the number of columns between values of 1 by row, considering the issues with starting or ending with zeros?
# example data.frame with desired result
df <- structure(list(Trial = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Location = c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L), Position = c(1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L), V1 = c(1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L), V2 = c(1L,
1L, 1L, 0L, 1L, 0L, 0L, 0L), V3 = c(1L, 1L, 1L, 0L, 1L, 0L, 0L,
1L), V4 = c(1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L), V5 = c(1L, 0L, 0L,
0L, 1L, 0L, 0L, 0L), V6 = c(1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L),
Result = c(1, 3, 2, NA, 1, 2.5, 3, 1.5)), .Names = c("Trial",
"Location", "Position", "V1", "V2", "V3", "V4", "V5", "V6", "Result"
), class = "data.frame", row.names = c(NA, -8L))
df1 <- df[,4:9]
# This code `apply(df1, 1, function(x) which(rev(x)==1)[1])` calculates the number of columns back until a value of 1, or forward without `rev`. But this doesn't quite help with the flaw.
If the range between the first and last 1 value is k and the total number of 1s in that range is n, then the average gap is (k-1)/(n-1). You can compute this with:
# For each row: the average gap between successive 1s, ignoring any
# leading/trailing 0s. With the 1s at positions first..last and n of
# them in total, the mean gap is (last - first) / (n - 1).
apply(df1, 1, function(x) {
w <- which(x == 1)  # positions of the 1s in this row
if (length(w) <= 1) NA  # fewer than two 1s: no gap to average
else diff(range(w)) / (length(w)-1)  # (last - first) / (n - 1)
})
# [1] 1.0 2.0 2.0 NA 1.0 2.5 3.0 1.5

Resources