Removing factor levels from variable X based on values in Y - r

I have a larger data frame with many factor levels. I would like to remove those levels for which all corresponding Y values are zero.
An example data set:
df <- structure(list(X = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L), .Label = c("A",
"B", "C", "D", "E"), class = "factor"), Y = c(1L, 2L, 0L, 2L,
0L, 0L, 0L, 0L, 2L, 5L, 1L, 1L, 0L, 0L, 1L, 8L, 0L, 0L, 0L, 0L
)), .Names = c("X", "Y"), class = "data.frame", row.names = c(NA,
-20L))
For this example, I would like to have the rows containing B and E removed.

We can group by 'X' and filter for rows that have any value in 'Y' not equal to 0
library(dplyr)
# Keep only the groups of X in which at least one Y is non-zero, then
# ungroup and discard the factor levels of X that no longer occur in any row
# (filter() alone removes the rows but leaves the unused levels in place).
df %>%
  group_by(X) %>%
  filter(any(Y != 0)) %>%
  ungroup() %>%
  droplevels()
Or use all() with negation (!):
# Drop the groups of X whose Y values are all zero, then ungroup and discard
# the factor levels of X that no longer occur in any row (filter() alone
# removes the rows but leaves the unused levels in place).
df %>%
  group_by(X) %>%
  filter(!all(Y == 0)) %>%
  ungroup() %>%
  droplevels()

You can do it in base R
# Keep the rows whose X level has at least one non-zero Y; droplevels() then
# removes the now-unused levels from the factor X.
droplevels(df[df$X %in% df$X[df$Y != 0], ])
X Y
1 A 1
2 A 2
3 A 0
4 A 2
9 C 2
10 C 5
11 C 1
12 C 1
13 D 0
14 D 0
15 D 1
16 D 8

Related

Compute proportion of outcome from repeated measures design

I have a table in the following format:
CowId Result IMI
1 S. aureus 1
1 No growth 0
2 No growth 0
2 No growth 0
3 E. coli 1
3 No growth 0
3 E. coli 0
4 Bacillus sp. 1
4 Contaminated 0
From this table, I would like to compute the proportion of CowIds that are negative for an IMI (0 = negative; 1 = positive) at all sampling time points.
In this example, 25% of cows [CowId = 2] tested negative for an IMI at all sampling time points.
To compute this proportion, my initial approach was to group each CowId, then compute the difference between the number of negative IMIs and the total number of IMI tests, where a resulting value of 0 would indicate that the cow was negative for an IMI at all time points.
As of now, my code computes this for each individual CowId. How can I augment this to compute the proportion described above?
fp %>%
filter(Result != "Contaminated") %>%
group_by(CowId) %>%
summarise(negative = (sum(IMI == 0) - length(IMI)))
We can count how many CowId's have tested negative at all points and calculate their ratio.
library(dplyr)
fp %>%
filter(Result != "Contaminated") %>%
group_by(CowId) %>%
summarise(negative = all(IMI == 0)) %>%
summarise(total_percent = mean(negative) * 100)
# total_percent
# <dbl>
#1 25
In base R, we can use aggregate
# Per cow (contaminated samples excluded): TRUE when every IMI is 0.
temp <- aggregate(
  IMI ~ CowId,
  data = subset(fp, Result != "Contaminated"),
  FUN = function(imi) all(imi == 0)
)
# Share of cows that were negative at every sampling, as a percentage.
100 * mean(temp$IMI)
data
fp <- structure(list(CowId = c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 4L),
Result = structure(c(5L, 4L, 4L, 4L, 3L, 4L, 3L, 1L, 2L), .Label =
c("Bacillus_sp.","Contaminated", "E.coli", "No_growth", "S.aureus"),
class = "factor"),IMI = c(1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L)),
class = "data.frame", row.names = c(NA, -9L))
With data.table
library(data.table)
setDT(fp)[Result != "Contaminated", .(negative = all(IMI == 0)),
.(CowId)][, .(total_percent = mean(negative)* 100 )]
# total_percent
#1: 25
data
fp <- structure(list(CowId = c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 4L),
Result = structure(c(5L, 4L, 4L, 4L, 3L, 4L, 3L, 1L, 2L), .Label =
c("Bacillus_sp.","Contaminated", "E.coli", "No_growth", "S.aureus"),
class = "factor"),IMI = c(1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L)),
class = "data.frame", row.names = c(NA, -9L))

Counting incidences from one data frame, entering results into a different data frame

I have two data frames: households and individuals.
This is households:
structure(list(ID = 1:5), class = "data.frame", row.names = c(NA,
-5L))
This is individuals:
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 4L, 4L, 4L, 4L, 5L, 5L), Yesno = c(1L, 0L, 1L, 0L, 0L, 0L,
1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-17L))
I'm trying to add a new column to households that counts the number of times variable Yesno is equal to 1, grouping results by ID.
I have tried
households$Count <- as.numeric(ave(individuals$Yesno[individuals$Yesno == 1], households$ID, FUN = count))
households should look like this:
ID Count
1 2
2 3
3 0
4 2
5 1
Option 1: In base R
Using merge and aggregate
aggregate(Yesno ~ ID, merge(households, individuals), FUN = sum)
# ID Yesno
#1 1 2
#2 2 3
#3 3 0
#4 4 2
#5 5 1
Option 2: With dplyr
Using left_join and group_by+summarise
library(dplyr)
left_join(households, individuals) %>%
group_by(ID) %>%
summarise(Count = sum(Yesno))
#Joining, by = "ID"
## A tibble: 5 x 2
# ID Count
# <int> <int>
#1 1 2
#2 2 3
#3 3 0
#4 4 2
#5 5 1
Option 3: With data.table
library(data.table)
setDT(households)
setDT(individuals)
households[individuals, on = "ID"][, .(Count = sum(Yesno)), by = ID]
# ID Count
#1: 1 2
#2: 2 3
#3: 3 0
#4: 4 2
#5: 5 1
Sample data
households <- structure(list(ID = 1:5), class = "data.frame", row.names = c(NA,
-5L))
individuals <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 4L, 4L, 4L, 4L, 5L, 5L), Yesno = c(1L, 0L, 1L, 0L, 0L, 0L,
1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-17L))
Another base R approach using sapply is to loop over each ID in households and subset that ID from individuals and count how many of them have 1 in Yesno column.
# For every household ID, count the rows of individuals with that ID whose
# Yesno equals 1. vapply() is the type-stable form of sapply(): it always
# returns one integer per ID, whereas sapply() returns a list for
# zero-length input.
households$Count <- vapply(
  households$ID,
  function(id) sum(individuals$Yesno[individuals$ID == id] == 1),
  integer(1)
)
households
# ID Count
#1 1 2
#2 2 3
#3 3 0
#4 4 2
#5 5 1
The == 1 part in the function can be removed if the Yesno column has only 0's and 1's.

print group in R by condition

Say, I have data
data=structure(list(x1 = structure(c(1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L), .Label = c("q",
"r", "w"), class = "factor"), x2 = structure(c(2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("e", "w"), class = "factor"), x3 = structure(c(1L,
1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L,
2L, 2L, 2L), .Label = c("e", "q", "r"), class = "factor"), var = c(34L,
35L, 34L, 34L, 34L, 34L, 35L, 34L, 34L, 34L, 34L, 35L, 34L, 34L,
34L, 34L, 34L, 34L, 34L, 34L), act = c(1L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("x1",
"x2", "x3", "var", "act"), class = "data.frame", row.names = c(NA,
-20L))
by columns x1, x2, x3 we have three groups
q w e
w e r
r e q
I need to print only those groups whose act column contains only the value 1.
In this case, only the w e r group has both 0 and 1 in the act column, while
the other groups, q w e and r e q, have only 1 in the act column, so I need to print them.
How to do it?
Expected output
x1 x2 x3
q w e
r e q
library(dplyr)
# Keep a group only when every one of its rows has act == 1, then reduce each
# surviving group to a single x1/x2/x3 row. Checking act on just the first row
# of each group (distinct() followed by filter()) depends on row order and
# would wrongly keep a group whose first row is 1 but a later row is 0.
data %>%
  group_by(x1, x2, x3) %>%
  filter(all(act == 1)) %>%
  ungroup() %>%
  distinct(x1, x2, x3) %>%
  as.data.frame()
x1 x2 x3
1 q w e
2 r e q
# Same rule as above (a group qualifies only when all of its rows have
# act == 1), restricted to the single group r / e / q. The check is done per
# group rather than on the first row only, so it does not depend on row order.
data %>%
  group_by(x1, x2, x3) %>%
  filter(all(act == 1)) %>%
  ungroup() %>%
  distinct(x1, x2, x3) %>%
  filter(x1 == "r", x2 == "e", x3 == "q") %>%
  as.data.frame()
x1 x2 x3
1 r e q
#OR
data %>% filter(x1=='r',x2=='e',x3=='q')
If I understand correctly, the OP has requested to extract/print only those groups which contain only the value 1 in the act column.
This can be achieved using the all() function.
data.table
library(data.table)
setDT(data)[, which(all(act == 1L)), by = .(x1, x2, x3)][, -"V1"]
x1 x2 x3
1: q w e
2: r e q
dplyr
library(dplyr)
data %>%
group_by(x1, x2, x3) %>%
filter(all(act == 1L)) %>%
distinct(x1, x2, x3)
# A tibble: 2 x 3
# Groups: x1, x2, x3 [2]
x1 x2 x3
<fct> <fct> <fct>
1 q w e
2 r e q
Data
In addition to the dataset data provided by the OP I have tested my answer with a second dataset which contains an additional group and where the rows in the w e r group are ordered differently.
data2 <- data.table::fread(
" x1 x2 x3 var act
a b c 33 0
a b c 33 0
q w e 34 1
q w e 35 1
w e r 34 1
w e r 35 1
w e r 34 0
w e r 35 0
r e q 34 1
r e q 34 1
", data.table = FALSE)

Get the rows that include specific strings

I have data like this:
df<- structure(list(Groups = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
No = c(8L, 4L, 9L, 2L, 7L, 3L, 5L, 1L, 2L), NO1 = c(1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L), Accessions = structure(c(6L,
5L, 1L, 3L, 2L, 7L, 4L, 9L, 8L), .Label = c("E9PCL5", "P00367",
"P05783", "P63104", "Q6DD88", "Q6FI13", "Q6P597-3", "Q7Z406-6",
"Q9BUA3"), class = "factor"), Accessions2 = structure(c(6L,
2L, 1L, 4L, 3L, 7L, 5L, 9L, 8L), .Label = c("B4DIW5; F8WA69; P56945; E9PCV2; F5GXA2; F5H855; E9PCL5; F5GXV6; F5H7Z0",
"F5GWF8; F5H6I7; Q6DD88", "P00367; B3KV55; B4DGN5; P49448; F5GYQ4; H0YFJ0; F8WA20",
"P05783; F8VZY9", "P63104; E7EX24; H0YB80; B0AZS6; B7Z2E6",
"Q16777; Q99878; F8WA69; H0YFX9; Q9BTM1; P20671; P0C0S8; Q6FI13",
"Q6P597-3; Q6P597-2; Q6P597", "Q7Z406-2; Q7Z406-6", "Q9BUA3"
), class = "factor"), NO3 = c(1L, 0L, 0L, 0L, 1L, 0L, 1L,
0L, 0L)), .Names = c("Groups", "No", "NO1", "Accessions",
"Accessions2", "NO3"), class = "data.frame", row.names = c(NA,
-9L))
I am trying to find the rows that contain specific strings in the Accessions2 column and then sum up NO1 for them.
For example, I want to know in which rows F8WA69 and Q9BUA3 occur.
So it will be
Groups No NO1 Accessions Accessions2 NO3
1 8 1 Q6FI13 Q16777; Q99878; F8WA69; H0YFX9; Q9BTM1; P20671; P0C0S8; Q6FI13 1
1 9 1 E9PCL5 B4DIW5; F8WA69; P56945; E9PCV2; F5GXA2; F5H855; E9PCL5; F5GXV6; F5H7Z0 0
and
Groups No NO1 Accessions Accessions2 NO3
1 3 1 Q6P597-3 Q6P597-3; Q6P597-2; Q9BUA3 0
1 1 1 Q9BUA3 Q9BUA3 0
1 2 1 Q7Z406-6 Q7Z406-2; Q9BUA3 0
Then sum up NO1 for each of them.
The first one is 2 and the second one is 3.
You can use simple grepl or grep to find rows where ID is present.
For example:
grepl("F8WA69", df$Accessions2)
[1] TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
To subset your data use:
df[grepl("F8WA69", df$Accessions2), ]
And if you want to iterate over multiple ID's and sum NO1 you can use sapply:
# Sum NO1 over the rows whose Accessions2 string contains each ID.
# fixed = TRUE matches the ID literally instead of as a regular expression.
# This is still a substring match, so an ID such as "Q6P597" would also hit
# "Q6P597-3". vapply() is the type-stable form of sapply() and guarantees
# exactly one number per ID; the result is named by ID.
vapply(
  c("F8WA69", "Q9BUA3"),
  function(id) sum(df$NO1[grepl(id, df$Accessions2, fixed = TRUE)]),
  numeric(1)
)
F8WA69 Q9BUA3
2 1

Replace NA values in dataframe variable with values from other dataframe by "ID"

I would like to know if there is a more concise way to replace NA values for a variable in a dataframe than what I did below. The code below seems to be longer than what I think might be possible in R. For example, I am unaware of some package/tool that might do this more succinctly.
Is there a way to replace, or merge values only if they are NA? After merging two dataframes using all.x = T I have some NA values, I'd like to replace those with information from another dataframe using a common variable to link the replacement.
# get dataframes
breaks <- structure(list(Break = 1:11, Value = c(2L, 13L, 7L, 9L, 40L,
21L, 10L, 37L, 7L, 26L, 42L)), .Names = c("Break", "Value"), class = "data.frame", row.names = c(NA,
-11L))
fsites <- structure(list(Site = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L), Plot = c(0L, 1L, 2L, 3L, 4L, 0L, 1L, 2L, 0L,
1L, 2L, 3L, 4L, 5L), Break = c(1L, 5L, 7L, 8L, 11L, 1L, 6L, 11L,
1L, 4L, 6L, 8L, 9L, 11L)), .Names = c("Site", "Plot", "Break"
), class = "data.frame", row.names = c(NA, -14L))
bps <- structure(list(Site = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L,
3L), Plot = c(0L, 1L, 2L, 3L, 1L, 2L, 0L, 1L, 2L, 3L, 4L), Value = c(0.393309653,
0.12465733, 0.27380161, 0.027288989, 0.439712533, 0.289724079,
0.036429062, 0.577460008, 0.820375917, 0.323217357, 0.28637503
)), .Names = c("Site", "Plot", "Value"), class = "data.frame", row.names = c(NA,
-11L))
# merge fsites and bps
# all.x = TRUE keeps every row of fsites; rows without a match in bps get NA
# in Value. TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
df1 <- merge(fsites, bps, by = c("Site", "Plot"), all.x = TRUE)
# merge df1 and breaks to get values to eventually replace the NA values in
# df1$Values.x, here "Break" is the ID by which to replace the NA values
df2 <- merge(df1, breaks, by=c("Break"))
# Create a new column 'Value' that uses Value.x, unless NA, then Value.y
df3 <- df2
df3$Value <- df2$Value.x
df2.na <- is.na(df2$Value.x)
df3$Value[df2.na] <- df2$Value.y[df2.na]
# get rid of unnecessary columns
cols <- c(1:3,6)
df4 <- df3[,cols]
At the stage where there is only (breaks, fsites, bps and) df1 around:
# Replace only the missing Value entries: each row's fallback is the Value of
# the breaks row with the same Break id; non-missing entries are kept as-is.
df1$Value <- local({
  fallback <- breaks$Value[match(df1$Break, breaks$Break)]
  missing_value <- is.na(df1$Value)
  ifelse(missing_value, fallback, df1$Value)
})
#> df1
# Site Plot Break Value
#1 1 0 1 0.39330965
#2 1 1 5 0.12465733
#3 1 2 7 0.27380161
#4 1 3 8 0.02728899
#5 1 4 11 42.00000000
#6 2 0 1 2.00000000
#7 2 1 6 0.43971253
#8 2 2 11 0.28972408
#9 3 0 1 0.03642906
#10 3 1 4 0.57746001
#11 3 2 6 0.82037592
#12 3 3 8 0.32321736
#13 3 4 9 0.28637503
#14 3 5 11 42.00000000
#just to test with your `df4`
> sort(df1$Value) == sort(df4$Value)
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

Resources