Subset using 'IF' and 'BY' in R - r

For a sample dataframe:
df <- structure(list(id = 1:19, region.1 = structure(c(1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 5L, 5L, 5L
), .Label = c("AT1", "AT2", "AT3", "AT4", "AT5"), class = "factor"),
PoorHealth = c(0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L)), .Names = c("id", "region.1",
"PoorHealth"), class = "data.frame", row.names = c(NA, -19L))
I want to subset using the BY command, and hoped somebody may be able to help me.
I want to INCLUDE the regions (column region.1) in df that satisfy this condition:
Less than (or equal to) 3 occurrences of '1' in the variable 'PoorHealth'
OR this condition:
Where N (i.e. the respondents in each region) is less than or equal to 6.
If anyone has any ideas to help me, I should be very grateful.

This should work. I don't know if there is a cleaner way:
library(data.table)
# Convert df to a data.table so the grouped `[` syntax below is available.
setDT(df)
# Evaluate the inclusion rule once per region (at most three PoorHealth == 1
# rows, or at most six respondents) and collect the regions where it holds.
qualified_regions <- df[, .(keep = sum(PoorHealth == 1) <= 3 | .N <= 6),
                        by = region.1][(keep), region.1]
# Retain every row that belongs to a qualifying region.
df[region.1 %in% qualified_regions]
Edit: I removed the `!` (negation) because the OP changed "EXCLUDE" to "INCLUDE" in the original question.

Related

HSD.test row names error. How do I check row names?

I have a dataframe for which I did a two-way ANOVA.
dput(m3)
structure(list(Delta = c(-40, -40, -40, -40, -31.7, -29.3, -27.8,
-26.7, -26.2, -25.4, -24.7, -23.1, -23, -22.9, -22.4, -22.2,
-21.4, -21, -20.8, -15.1, -14.9, -14.1, -6.2, -6.2, -6, -5.3,
-4.9), Location = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 3L, 2L,
3L, 3L, 3L), .Label = c("int", "pen + int", "ter + pen"), class = "factor"),
Between = c(0L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 0L, 2L, 1L, 0L,
1L, 0L, 2L, 0L, 2L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L
), Relative = structure(c(5L, 6L, 6L, 7L, 8L, 3L, 3L, 4L,
5L, 4L, 3L, 5L, 3L, 5L, 7L, 5L, 4L, 6L, 3L, 3L, 6L, 2L, 1L,
2L, 1L, 1L, 1L), .Label = c("1&2", "2&3", "2&4", "2&5", "3&4",
"3&5", "3&6", "4&6"), class = "factor")), class = "data.frame", row.names = c(NA,
-27L))
library(agricolae)
# Two-way ANOVA with a Location-by-X.between interaction.
# NOTE(review): the dput(m3) output above lists the columns Delta, Location,
# Between and Relative, whereas this formula refers to Delta. and X.between —
# confirm which names the real data uses.
aov.2sum=aov(Delta.~Location*X.between, data=m3)
I want to analyze the data using a HSD.test as I have for another dataframe using the same features.
I am following the code format in the package manual as below.
# Combine Location and X.between into a single treatment factor.
# NOTE(review): X.between does not appear in the dput(m3) output above (the
# column shown there is Between) — confirm where X.between comes from.
tx <- with(m3, interaction(Location, X.between))
# One-way ANOVA of Delta on the combined factor.
amod <-aov(Delta~tx, data=m3)
# Run HSD.test on the "tx" term of that model, requesting grouped output.
test=HSD.test(amod, "tx", group=TRUE)
Then I receive the following error
Error in .rowNamesDF<-(x, value = value) :
duplicate 'row.names' are not allowed
In addition: Warning message:
non-unique values when setting 'row.names': ‘int.0’, ‘pen + int.1’, ‘pen + int.2’, ‘te + int.0’, ‘te + int.1’
Upon further analysis I see that my duplicate row names error is related to my X.between feature. When I use the following code I get the same duplicate row names error:
HSD.test(amod, "X.between", group=TRUE)
>> Error in data.frame(row.names = means[, 1], means[, 2:6]) :
duplicate row.names: 0, 1, 2
How are row names chosen for the HSD.test?
Then how can I change my row names? Or just avoid this duplication error?
Thank you for all and any help.

set removes variable in all data frames in workspace

I have a simple question, to which I have not been able to find a solution here:
When I want to keep a selection of variables from a data frame, the variables get removed from all copies of that data frame loaded in my workspace.
Is there a way to only remove it from a single data frame?
A reproducible example (only remove it from df and not df2)?
require(data.table)
df <- structure(list(group = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), x = c(0L, 0L, 0L, 1L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L),
time = c(1636L, 1637L, 1638L, 1639L, 1640L, 1641L, 1642L,
1683L, 1684L, 1685L, 1686L, 1687L, 1688L, 1689L, 1690L, 1691L,
1638L, 1639L, 1640L)), .Names = c("group", "x", "time"), class = "data.frame", row.names = c(NA,
-19L))
# NOTE(review): `df2 <- df` presumably binds a second name to the same object
# rather than copying it, and data.table::set() updates its argument by
# reference, which would explain df2 losing the column too.
# data.table::copy(df) should give an independent copy — confirm against the
# data.table documentation.
df2 <- df
# Names of the columns to retain.
varstokeep <- c("group","x")
# Positions of every column whose name is not in varstokeep.
vartodrop <- which(!names(df)%in%varstokeep)
# Assign NULL to those columns of df to remove them.
set(df, i=NULL, j=vartodrop, value=NULL)
The reason is that I have a large file, which I use as the basis for multiple (more aggregated) files. Having to load the basic file 6 times would take a lot more time.

Developing a function to analyse rows of a data.table in R

For a sample dataframe:
df1 <- structure(list(area = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("a",
"b"), class = "factor"), region = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("a1",
"a2", "b1", "b2"), class = "factor"), weight = c(0, 1.2, 3.2,
2, 1.6, 5, 1, 0.5, 0.2, 0, 1.5, 2.3, 1.5, 1.8, 1.6, 2, 1.3, 1.4,
1.5, 1.6, 2, 3, 4, 2.3, 1.3, 2.1, 1.3, 1.6, 1.7, 1.8, 2, 1.3,
1, 0.5), var.1 = c(0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L,
1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L,
0L, 0L, 0L, 1L, 0L, 1L, 0L), var.2 = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L,
1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L)), .Names = c("area",
"region", "weight", "var.1", "var.2"), class = c("data.table",
"data.frame"))
I want to first produce a summary table...
# Make sure df1 is a data.table, then build one summary row per area:
# the row count, the number of rows with var.1 == 1, and the weight-weighted
# percentage of rows with var.1 == 1.
setDT(df1)
area_summary <- df1[, .(
  N = .N,
  freq.1 = sum(var.1 == 1),
  result = 100 * weighted.mean(var.1 == 1, w = weight)
), by = area]
...and then populate it by running the following code for each area (e.g. a, b). This looks for the highest and lowest 'result' in each region, and then produces a xtabs and calculates the relative difference (RD) before adding these to the summary table. Here I have developed the code for area 'a':
# Restrict the data to area "a" and summarise each of its regions
a_cntry <- subset(df1, area == "a")
a_cntry.summary <- setDT(a_cntry)[, .(
  .N,
  freq.1 = sum(var.1 == 1),
  result = weighted.mean((var.1 == 1), w = weight) * 100
), by = region]
# Include only the regions with the lowest and the highest percentage
incl <- a_cntry.summary[c(which.min(result), which.max(result)), region]
a_cntry <- a_cntry[a_cntry$region %in% incl, ]
# Produce the weighted xtabs table of var.1 by region
a_cntry.var.1 <- xtabs(weight ~ var.1 + region, data = a_cntry)
a_cntry.var.1
# Test of equal proportions; RD is the negated difference between the two
# estimated proportions and RDpvalue is the p-value of the test
RD.var.1 <- prop.test(
  x = a_cntry.var.1[, 2],
  n = rowSums(a_cntry.var.1),
  correct = FALSE
)
RD <- round(-diff(RD.var.1$estimate), 3)
RDpvalue <- round(RD.var.1$p.value, 4)
RD
RDpvalue
# Add RD and RDpvalue to the summary table
area_summary$RD[area_summary$area == "a"] <- RD
area_summary$RDpvalue[area_summary$area == "a"] <- RDpvalue
# Clean up the intermediates. The former unused `region` object is no longer
# created, so it is no longer removed here (which also avoids deleting an
# unrelated `region` object from the workspace).
rm(RD, RD.var.1, RDpvalue, a_cntry.var.1, incl, a_cntry, a_cntry.summary)
I wish to wrap this code into a function, so I can just specify the 'areas' (in the 'area' column in df1) and then the code completes all the analysis and adds the results to the summary table.
If I wanted to call my function stats, I understand it may start like this:
# Skeleton for the requested wrapper: takes df1 and x (per the surrounding
# text, x is meant to identify the area(s) to analyse).
# NOTE(review): apply() is called with x only; base::apply() also expects
# MARGIN and FUN, so this placeholder body presumably fails as written and
# needs replacing with the per-area analysis — confirm.
stats= function (df1, x) {
apply(x)
}
If anyone can start me off developing my function, I should be most grateful.

identifying rows in data frame that exhibit patterns

Below I have a data frame with 3 columns: a group field, an open/closed field for the store, and the rolling sum of 3-month opens for the store. I also have the desired solution output.
My dataset can be thought of as an employee's availability. You can assume each row to be a different time period (hour, day, month, year, whatever). In the open/closed column I have whether or not the employee was present. The 3-month rolling column is a sum of the previous rows.
What I want to identify is the non-zero values in this rolling sum column following a gap of at least 3 zero rows for that particular group. While not present in this dataset, you can assume that there might be more than one 'gap' of zeros present.
structure(list(Group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("A", "B"), class = "factor"), X0_closed_1_open = c(0L,
1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), X3month_roll_open = c(0L,
0L, 1L, 2L, 2L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 2L, 0L, 1L, 1L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L), desired_solution = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("no", "yes"), class ="factor")), .Names = c("Group", "X0_closed_1_open", "X3month_roll_open", "desired_solution"), class = "data.frame", row.names = c(NA,
-26L))
One option is:
# For every Group, flag the non-zero rolling-sum rows that come after a run of
# at least three consecutive zero rows, then put the rows back in their
# original order.
res <- unsplit(
  lapply(split(df1, df1$Group), function(grp) {
    # Run-length encode the "rolling sum is zero" indicator.
    zero_runs <- with(grp, rle(X3month_roll_open == 0))
    # Among the zero runs, stay TRUE only for those lasting three rows or more.
    is_zero_run <- zero_runs$values
    zero_runs$values[is_zero_run] <- zero_runs$lengths[is_zero_run] >= 3
    in_long_gap <- inverse.rle(zero_runs)
    # A TRUE -> FALSE step marks the first row after a long gap; the running
    # total counts how many such gaps have already been passed.
    gap_exits <- c(0, diff(in_long_gap) < 0)
    gaps_passed <- cumsum(gap_exits)
    grp$Flag <- gaps_passed != 0 & grp[, 3] != 0
    grp
  }),
  df1$Group
)
NOTE: Instead of 'yes/no', it may be better to have 'TRUE/FALSE' for easing subsetting.
identical(c('no', 'yes')[res$Flag+1L], as.character(res$desired_solution))
#[1] TRUE

Removing Survey non-response in R

So, I have a data frame with several continuous variables and several dummy variables. The survey that this data frame comes from uses 6,7,8 and 9 to denote different types of non-response. So, I would like to replace 6,7,8 and 9 with NA whenever they show up in a dummy variable column but leave them be in the continuous variable column.
Is there a concise way to go about doing this?
Here's my data:
> dput(head(sfsuse[c(4:16)]))
structure(list(famsize = c(3L, 1L, 2L, 5L, 3L, 5L), famtype = c(2L,
1L, 2L, 3L, 2L, 3L), cc = c(1L, 1L, 1L, 1L, 1L, 1L), nocc = c(1L,
1L, 1L, 3L, 1L, 1L), pdloan = c(2L, 2L, 2L, 2L, 2L, 2L), help = c(2L,
2L, 2L, 2L, 2L, 2L), budget = c(1L, 1L, 1L, 1L, 2L, 2L), income = c(340000L,
20500L, 0L, 165000L, 95000L, -320000L), govtrans = c(7500L, 15500L,
22000L, 350L, 0L, 9250L), childexp = c(0L, 0L, 0L, 0L, 0L, 0L
), homeown = c(1L, 1L, 1L, 1L, 1L, 2L), bank = c(2000L, 80000L,
25000L, 20000L, 57500L, 120000L), vehval = c(33000L, 7500L, 5250L,
48000L, 8500L, 50000L)), .Names = c("famsize", "famtype", "cc",
"nocc", "pdloan", "help", "budget", "income", "govtrans", "childexp",
"homeown", "bank", "vehval"), row.names = c(NA, 6L), class = "data.frame")
I'm trying to substitute NA for 6, 7, 8 and 9 in columns 3:7 and column 11. I know how to do this one column at a time by the column names:
df$name[df$name %in% 6:9]<-NA
but I would have to do this for each column by name, is there a concise way to do it by column index?
Thanks
This function should work
# Replace survey non-response codes with NA in the selected columns.
#
# data:  a data frame whose columns are indexed with `data[, col]`.
# k:     one or more column positions or names to clean. A single value
#        behaves exactly as before; a vector is processed column by column.
# codes: values treated as non-response (defaults to 6:9).
#
# Returns `data` with every entry of the selected columns that matches
# `codes` set to NA; all other columns are returned unchanged.
f <- function(data, k, codes = 6:9) {
  # Work on one column at a time: `%in%` applied to several data-frame columns
  # at once compares whole columns instead of their elements, so passing a
  # vector `k` would otherwise silently replace nothing.
  for (col in k) {
    data[data[, col] %in% codes, col] <- NA
  }
  data
}
Now at the console:
> for (k in c(3:7,11)) { data <- f(data,k) }

Resources