Run function through all combinations of rows in R - r

For a sample dataframe:
df <- structure(list(area = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L,
4L, 4L, 4L), .Label = c("a1", "a2", "a3", "a4"), class = "factor"),
result = c(0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L,
1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L),
weight = c(0.5, 0.8, 1, 3, 3.4, 1.6, 4, 1.6, 2.3, 2.1, 2,
1, 0.1, 6, 2.3, 1.6, 1.4, 1.2, 1.5, 2, 0.6, 0.4, 0.3, 0.6,
1.6, 1.8)), .Names = c("area", "result", "weight"), class = "data.frame", row.names = c(NA,
-26L))
I wish to calculate the risk difference between all combinations of areas (i.e. a1 and a2, a1 and a3, a2 and a3). Preferably this would be in a matrix form.
Up till now, I have just looked at comparing the risk difference (RD) between the regions with the highest and lowest results:
# Keep only the two areas with the lowest and the highest 'result' value.
# NOTE(review): df.summary is not defined in this snippet - presumably a
# per-area summary table with 'area' and 'result' columns; confirm.
df.summary <- data.table(df.summary)
incl <- df.summary[c(which.min(result), which.max(result)),area]
df.new <- df[df$area %in% incl,]
# Rebuild the factor so that only the levels still present in df.new remain.
df.new$area <- factor(df.new$area)
# Weighted cross-tabulation: rows are result values, columns are areas.
df.xtabs <- xtabs(weight ~ result + area, data=df.new)
df.xtabs
# Two-sample test of proportions without continuity correction.
# RD is the first estimated proportion minus the second, rounded to 3 d.p.
# NOTE(review): x is the table's second column (the second area) and n is the
# row totals (one per result value), so the estimates are the second area's
# share within each result value. If the intended quantity is the proportion
# of result == 1 within each area, the table needs area as rows - confirm.
RD.result <- prop.test(x=df.xtabs[,2], n=rowSums(df.xtabs), correct = FALSE)
RD <- round(- diff(RD.result$estimate), 3)
... But how would I change this to ensure the code runs through all combinations of areas without having to specify each one in turn? (I may have up to 19 areas).

You can do it using combn function. For example,
# Every unordered pair of areas: a 2 x n_pairs character matrix with one
# pair per column.
uniqueCombinations <- combn(unique(as.character(df$area)), 2)
n_pairs <- ncol(uniqueCombinations)

# One row per pair. The columns are created with their final types up front
# (character labels, numeric RD), so nothing is coerced inside the loop.
resultDF <- data.frame(
  area_1 = uniqueCombinations[1, ],
  area_2 = uniqueCombinations[2, ],
  RD = rep(NA_real_, n_pairs),
  stringsAsFactors = FALSE
)

# seq_len() is used instead of 1:n so that an empty set of pairs yields zero
# iterations rather than the sequence c(1, 0).
for (i in seq_len(n_pairs)) {
  # The two areas compared in this iteration.
  incl <- uniqueCombinations[, i]
  print(incl)

  # Restrict the data to this pair. The factor is rebuilt with levels = incl
  # so that the table has exactly two area columns, in the same order as
  # area_1 / area_2 in resultDF (independent of the original level order).
  df.new <- df[df$area %in% incl, ]
  df.new$area <- factor(df.new$area, levels = incl)

  # Weighted cross-tabulation: rows are result values, columns are areas.
  df.xtabs <- xtabs(weight ~ result + area, data = df.new)

  # Two-sample test of proportions without continuity correction; RD is the
  # first estimated proportion minus the second, rounded to 3 d.p.
  # NOTE(review): x is the second area column and n the per-result row
  # totals, exactly as in the question's code. If the intended quantity is
  # the proportion of result == 1 within each area, the table needs area as
  # rows (xtabs(weight ~ area + result, ...)) - confirm before relying on RD.
  RD.result <- prop.test(
    x = df.xtabs[, 2],
    n = rowSums(df.xtabs),
    correct = FALSE
  )
  RD <- round(-diff(unname(RD.result$estimate)), 3)
  resultDF$RD[i] <- RD
}
resultDF
UPDATE: the code now creates resultDF, which collects the result of each loop iteration (one row per pair of areas).

Related

Ordering xtab BY proportion in R

For a sample dataframe:
df <- structure(list(region = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("a", "b", "c", "d"), class = "factor"),
result = c(1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L), weight = c(0.126,
0.5, 0.8, 1.5, 5.3, 2.2, 3.2, 1.1, 0.1, 1.3, 2.5)), .Names = c("region",
"result", "weight"), row.names = c(NA, 11L), class = "data.frame")
df$region <- factor(df$region)
result <- xtabs(weight ~ region + result, data=df)
result
I want to reorder the 1s of the result column. As I understand it (from here), I could use order:
# Sort the regions by the weighted total in the second column (result == 1),
# largest first. TRUE is spelled out because T can be reassigned, and
# drop = FALSE keeps the table two-dimensional even with a single region.
result <- result[order(result[, 2], decreasing = TRUE), , drop = FALSE]
result
result
region 0 1
b 6.9 3.500
a 5.8 2.426
HOWEVER, this appears to order just by the (weighted) number of 1s - I want instead to order by the proportion of 1s in each region (i.e. the percentage). How can I use order (or something else) to arrange my xtab the way I want?
Use prop.table:
result[order(prop.table(result,1)[,2], decreasing=TRUE),]
# result
#region 0 1
# b 6.9 3.500
# a 5.8 2.426
Where prop.table(result,1) gives:
prop.table(result,1)
# result
#region 0 1
# a 0.7050814 0.2949186
# b 0.6634615 0.3365385

Change the order/flip the xtab in R

For a sample dataframe:
df <- structure(list(region = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("a", "b", "c", "d"), class = "factor"),
result = c(0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L), weight = c(0.126,
0.5, 0.8, 1.5, 5.3, 2.2, 3.2, 1.1, 0.1, 1.3, 2.5)), .Names = c("region",
"result", "weight"), row.names = c(NA, 11L), class = "data.frame")
I am producing a weighted xtab:
df$region <- factor(df$region)
result <- xtabs(weight ~ result + region, data=df)
result
Which is:
region
result a b
0 6.926 6.900
1 1.300 3.500
How can I flip the xtab around so the region and result variables are the other way around (i.e. region as rows and result as columns).
I thought this might work, but alas no!
result <- xtabs(region + (weight ~ result), data=df)
Any help would be much appreciated.
Just reverse the order of entries:
xtabs(weight ~ region + result, data=df)

Developing a function to analyse rows of a data.table in R

For a sample dataframe:
df1 <- structure(list(area = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("a",
"b"), class = "factor"), region = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("a1",
"a2", "b1", "b2"), class = "factor"), weight = c(0, 1.2, 3.2,
2, 1.6, 5, 1, 0.5, 0.2, 0, 1.5, 2.3, 1.5, 1.8, 1.6, 2, 1.3, 1.4,
1.5, 1.6, 2, 3, 4, 2.3, 1.3, 2.1, 1.3, 1.6, 1.7, 1.8, 2, 1.3,
1, 0.5), var.1 = c(0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L,
1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L,
0L, 0L, 0L, 1L, 0L, 1L, 0L), var.2 = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L,
1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L)), .Names = c("area",
"region", "weight", "var.1", "var.2"), class = c("data.table",
"data.frame"))
I want to first produce a summary table...
area_summary <- setDT(df1)[,.(.N, freq.1 = sum(var.1==1), result = weighted.mean((var.1==1),
w = weight)*100), by = area]
...and then populate it by running the following code for each area (e.g. a, b). Within an area, this finds the regions with the highest and lowest 'result', then produces an xtabs table and calculates the risk difference (RD) and its p-value before adding these to the summary table. Here I have developed the code for area 'a':
# Restrict to area "a" and summarise each of its regions: row count (N),
# number of rows with var.1 == 1, and weighted percentage of var.1 == 1.
a_cntry <- subset(df1, area=="a")
a_cntry.summary <- setDT(a_cntry)[,.(.N, freq.1 = sum(var.1==1), result = weighted.mean((var.1==1),
w = weight)*100), by = region]
# Keep only the regions with the lowest and the highest weighted percentage.
incl <- a_cntry.summary[c(which.min(result), which.max(result)),region]
# NOTE(review): 'region' is not used again below except in the final rm().
region <- as.data.frame.matrix(a_cntry)
a_cntry <- a_cntry[a_cntry$region %in% incl,]
# Weighted cross-tabulation: rows are var.1 values, columns are regions.
# NOTE(review): the region factor is not rebuilt after the subset above, so
# levels of regions that were filtered out may still appear as columns here -
# confirm that column 2 is the intended one.
a_cntry.var.1 <- xtabs(weight ~ var.1 + region, data=a_cntry)
a_cntry.var.1
# Two-sample test of proportions without continuity correction.
# RD is the first estimated proportion minus the second (3 d.p.);
# RDpvalue is the p-value of the test (4 d.p.).
RD.var.1 <- prop.test(x=a_cntry.var.1[,2], n=rowSums(a_cntry.var.1), correct = FALSE)
RD <- round(- diff(RD.var.1$estimate), 3)
RDpvalue <- round(RD.var.1$"p.value", 4)
RD
RDpvalue
# Add RD and RDpvalue to the summary table, in the row for area "a".
area_summary$RD[area_summary$area == "a"] <- RD
area_summary$RDpvalue[area_summary$area == "a"] <- RDpvalue
# Remove the intermediate objects created above.
rm(RD, RD.var.1, RDpvalue, a_cntry.var.1, incl, a_cntry,a_cntry.summary,region)
I wish to wrap this code into a function, so I can just specify the 'areas' (in the 'area' column in df1) and then the code completes all the analysis and adds the results to the summary table.
If I wanted to call my function stats, I understand it may start like this:
# Placeholder skeleton for the wrapper function described in the question;
# it is not a working implementation.
# NOTE(review): apply() requires MARGIN and FUN in addition to its first
# argument, so apply(x) as written fails when called; df1 is not used yet.
# NOTE(review): the name 'stats' coincides with the base package 'stats' -
# a more specific name may be clearer.
stats= function (df1, x) {
apply(x)
}
If anyone can start me off developing my function, I should be most grateful.

Problems with levels in a xtab in R

For a sample dataframe:
df <- structure(list(area = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L,
4L, 4L, 4L), .Label = c("a1", "a2", "a3", "a4"), class = "factor"),
result = c(0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L,
1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L),
weight = c(0.5, 0.8, 1, 3, 3.4, 1.6, 4, 1.6, 2.3, 2.1, 2,
1, 0.1, 6, 2.3, 1.6, 1.4, 1.2, 1.5, 2, 0.6, 0.4, 0.3, 0.6,
1.6, 1.8)), .Names = c("area", "result", "weight"), class = "data.frame", row.names = c(NA,
-26L))
I am trying to isolate the areas with the highest and lowest results and then produce a weighted crosstab, which is then used to calculate the risk difference.
# Per-area summary: row count (N), number of rows with result == 1, and the
# weighted percentage of result == 1 (weights taken from 'weight').
df.summary <- setDT(df)[,.(.N, freq.1 = sum(result==1), result = weighted.mean((result==1),
w = weight)*100), by = area]
# Keep only the areas with the lowest and the highest weighted percentage.
df.summary <- data.table(df.summary)
incl <- df.summary[c(which.min(result), which.max(result)),area]
df.new <- df[df$area %in% incl,]
# incl is a factor, so printing it also lists every original level of area.
incl
'incl' has the two areas that I want, but still the four levels:
[1] a2 a3
Levels: a1 a2 a3 a4
How do I get rid of the levels as well? The subsequent analysis that I want to do needs just the two levels as well as the areas. Any ideas?
I found this elsewhere on the web (e.g. Problems with levels in a xtab in R)
df.new$area <- factor(df.new$area)
It works!
Hope it's useful for others.

Using Rs data.table on (weighted) survey data [duplicate]

This question already has answers here:
How to compute weighted mean in R?
(2 answers)
Closed 7 years ago.
For a sample dataframe:
df <- structure(list(id = 1:25, region.1 = structure(c(1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L), .Label = c("AT1", "AT2", "AT3", "AT4"
), class = "factor"), gndr = c(0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L,
1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L,
1L), PoorHealth = c(0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L), weight = c(0.3,
1.6, 2.5, 3.5, 0.2, 0.2, 0.2, 0.6, 0.15, 0.25, 1.36, 1, 1, 1,
0.1, 0.2, 0.3, 0.3, 0.3, 0.4, 0.3, 1, 1.4, 1.3, 0.4)), .Names = c("id",
"region.1", "gndr", "PoorHealth", "weight"), class = c("data.table",
"data.frame"), row.names = c(NA, -25L))
I wish to create a summary data table (using data.table) using the code:
variable.table_1 <- setDT(df)[,.(.N,result=sum((PoorHealth==1)/.N)*100),
by=region.1]
However my original data is from a survey and I therefore have a design and population weight which I have multiplied together (following the guidance from the survey, and have called this variable 'weight').
How do I apply an appropriate weighting of my 'result' variable in variable.table_1?
Perhaps I have to use the survey package? Looking here, it seems that I first have to run my dataframe through the survey package...
library(survey)
df.w <- svydesign(id = ~1, data = df, weights = df$weight)
... but I am unsure how I incorporate the results into my summary data table.
Many thanks in advance.
Perhaps you can use the weighted.mean function
# Per region.1: row count (N) and the weighted percentage of
# PoorHealth == 1, using 'weight' as the weights.
variable.table_1 <- setDT(df)[,.(.N, result = weighted.mean((PoorHealth==1),
w = weight)*100), by = region.1]
In your example you could also simply use mean instead of sum in combination with /.N.

Resources