Filter based on NA in dplyr

This is my df
df <- structure(structure(list(group = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L), .Label = c("A", "B", "C", "D", "E"), class = "factor"), y = c(NA, NA, NA, NA, 1, NA, NA, NA, 1, 2, NA, NA, 1, 2, 3, NA, 2, 2, 3, 4, NA, 3, 3, 4, 5), x = c(1L, 2L, 3L, 4L,5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L)), .Names = c("group", "y", "x"), row.names = c(NA, 25L), class = "data.frame"))
> df
group y x
1 A NA 1
2 A NA 2
3 A NA 3
4 A NA 4
5 A 1 5
6 B NA 1
7 B NA 2
8 B NA 3
9 B 1 4
10 B 2 5
11 C NA 1
12 C NA 2
13 C 1 3
14 C 2 4
15 C 3 5
16 D NA 1
17 D 2 2
18 D 2 3
19 D 3 4
20 D 4 5
21 E NA 1
22 E 3 2
23 E 3 3
24 E 4 4
25 E 5 5
My goal is to calculate the mean per x value (across groups), using mutate. But first I'd like to filter the data, such that only those values of x remain for which there are at least 3 non-NA values of y. So in this example I only want to include those entries for which x is at least 3. I can't figure out how to write the filter() call; any suggestions?

You could try
df %>%
  group_by(group) %>%   # or group_by(x), as per the OP's clarification
  filter(sum(!is.na(y)) >= 3) %>%
  mutate(Mean = mean(x, na.rm = TRUE))
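If, per the OP's clarification, the grouping should be by x value and the average is taken over y, the same idea reads as follows (a sketch, assuming y is the column to be averaged):
library(dplyr)
df %>%
  group_by(x) %>%                        # one group per x value, across A-E
  filter(sum(!is.na(y)) >= 3) %>%        # keep x values with at least 3 non-NA y
  mutate(Mean = mean(y, na.rm = TRUE))   # mean of y per x value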

Related

Subset in R with specific values for specific columns identified by their index number

If I have a data frame like this:
df = data.frame(A = sample(1:5, 10, replace=T), B = sample(1:5, 10, replace=T), C = sample(1:5, 10, replace=T), D = sample(1:5, 10, replace=T), E = sample(1:5, 10, replace=T))
Giving me this:
A B C D E
1 1 5 1 4 3
2 2 3 5 4 3
3 4 2 2 4 4
4 2 1 2 5 2
5 3 3 4 4 5
6 3 2 3 1 5
7 1 5 4 2 3
8 1 3 5 5 1
9 3 1 1 3 5
10 5 3 1 2 4
How do I get a subset that includes all the rows where the values for certain columns (B and D, say) are equal to 1, with the columns identified by their index numbers (2 and 4) rather than their names? In this case:
A B C D E
4 2 1 2 5 2
6 3 2 3 1 5
9 3 1 1 3 5
df[rowSums(df[c(2,4)] == 1) > 0,]
# A B C D E
# 4 2 1 2 5 2
# 6 3 2 3 1 5
# 9 3 1 1 3 5
You said to compare values by column index, so df[c(2,4)] (or df[, c(2,4)]).
df[c(2,4)] == 1 returns a matrix of logicals, whether the cell's value is equal to 1.
rowSums(.) > 0 finds those rows with at least one 1.
df[rowSums(.)>0,] selects just those rows.
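To make the intermediate step concrete with the data below, the row sums flag exactly the rows where B or D equals 1:
rowSums(df[c(2, 4)] == 1)
#  1  2  3  4  5  6  7  8  9 10
#  0  0  0  1  0  1  0  0  1  0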
Data
df <- structure(list(A = c(1L, 2L, 4L, 2L, 3L, 3L, 1L, 1L, 3L, 5L), B = c(5L, 3L, 2L, 1L, 3L, 2L, 5L, 3L, 1L, 3L), C = c(1L, 5L, 2L, 2L, 4L, 3L, 4L, 5L, 1L, 1L), D = c(4L, 4L, 4L, 5L, 4L, 1L, 2L, 5L, 3L, 2L), E = c(3L, 3L, 4L, 2L, 5L, 5L, 3L, 1L, 5L, 4L)), class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"))
tidyverse
df <-
structure(
list(
A = c(1L, 2L, 4L, 2L, 3L, 3L, 1L, 1L, 3L, 5L),
B = c(5L, 3L, 2L, 1L, 3L, 2L, 5L, 3L, 1L, 3L),
C = c(1L, 5L, 2L, 2L, 4L, 3L, 4L, 5L, 1L, 1L),
D = c(4L, 4L, 4L, 5L, 4L, 1L, 2L, 5L, 3L, 2L),
E = c(3L, 3L, 4L, 2L, 5L, 5L, 3L, 1L, 5L, 4L)
),
class = "data.frame",
row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10")
)
library(tidyverse)
df %>%
  filter(B == 1 | D == 1)
#> A B C D E
#> 4 2 1 2 5 2
#> 6 3 2 3 1 5
#> 9 3 1 1 3 5
Created on 2022-01-23 by the reprex package (v2.0.1)
data.table
library(data.table)
setDT(df)[B == 1 | D == 1, ]
#> A B C D E
#> 1: 2 1 2 5 2
#> 2: 3 2 3 1 5
#> 3: 3 1 1 3 5
Created on 2022-01-23 by the reprex package (v2.0.1)
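If the columns really have to be picked by position rather than by name in the tidyverse version, tidy selection also accepts indices; a sketch, assuming dplyr >= 1.0.4 for if_any():
library(dplyr)
df %>%
  filter(if_any(c(2, 4), ~ .x == 1))   # columns 2 and 4 selected by position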

Grouping a dataframe into matrices based on a variable and transposing

Here is some mock data related to this problem:
structure(list(HHID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L), PERS = c(1L, 2L, 3L, 4L, 5L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 1L, 2L, 3L, 4L, 5L), MARSTAT = c(2L,
2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 5L, 1L, 1L
), SEX = c(1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 1L), VAR1 = c(NA, 1L, 4L, 4L, 4L, NA, 1L, 5L, 4L,
NA, 4L, 4L, NA, 1L, 8L, 4L, 4L), VAR2 = c(NA, NA, 4L, 4L, 4L,
NA, NA, 4L, 5L, NA, NA, 6L, NA, NA, 12L, 4L, 4L), VAR3 = c(NA,
NA, NA, 6L, 6L, NA, NA, NA, 7L, NA, NA, NA, NA, NA, NA, 11L,
11L), VAR4 = c(NA, NA, NA, NA, 6L, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 6L), VAR5 = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), FLAG = c(0L,
0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L
)), .Names = c("HHID", "PERS", "MARSTAT", "SEX", "VAR1", "VAR2",
"VAR3", "VAR4", "VAR5", "FLAG"), row.names = c(NA, 17L), class = "data.frame")
For each household in my data, I want to transpose the values in the lower triangle into the upper triangle so that for each household I essentially have a symmetrical matrix with the diagonal either NA or 0 (for this analysis, 0 and NA are interchangeable). So based on the above example, I would be looking for the following dataset:
structure(list(HHID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L), PERS = c(1L, 2L, 3L, 4L, 5L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 1L, 2L, 3L, 4L, 5L), MARSTAT = c(2L,
2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 5L, 1L, 1L
), SEX = c(1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 1L), VAR1 = c(NA, 1L, 4L, 4L, 4L, NA, 1L, 5L, 4L,
NA, 4L, 4L, NA, 1L, 8L, 4L, 4L), VAR2 = c(1L, NA, 4L, 4L, 4L,
1L, NA, 4L, 5L, 4L, NA, 6L, 1L, NA, 12L, 4L, 4L), VAR3 = c(4L,
4L, NA, 6L, 6L, 5L, 4L, NA, 7L, 4L, 6L, NA, 8L, 12L, NA, 11L,
11L), VAR4 = c(4L, 4L, 6L, NA, 6L, 4L, 5L, 7L, NA, NA, NA, NA,
4L, 4L, 11L, NA, 6L), VAR5 = c(4L, 4L, 6L, 6L, NA, NA, NA, NA,
NA, NA, NA, NA, 4L, 4L, 11L, 6L, NA), FLAG = c(0L, 0L, 0L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 4L, 4L, 11L, 1L, 1L)), .Names = c("HHID",
"PERS", "MARSTAT", "SEX", "VAR1", "VAR2", "VAR3", "VAR4", "VAR5",
"FLAG"), class = "data.frame", row.names = c(NA, -17L))
I have been able to do this for one household, as follows (though it misses the HHID which I would need to distinguish between households):
HH1 <- df %>%
  filter(HHID == 1) %>%
  select(VAR1, VAR2, VAR3, VAR4, VAR5)
HH1 <- as.matrix(HH1)
HH1[is.na(HH1)] <- 0
T_HH1 <- t(HH1)
T_HH1[is.na(T_HH1)] <- 0
combo <- HH1 + T_HH1
A <- combo
However, how would I go about doing this for multiple households across my dataset, also keeping the "HHID" and "PERS" information so that I can link on any extra info if needed?
Thank you so much in advance!
One approach is:
1. Split your data frame by HHID into groups.
2. Create a custom function that takes the VAR columns of a group, keeps the square person-by-person block, and mirrors the lower triangle into the upper triangle.
3. Use rbindlist() to reconstruct the rows, with fill = TRUE adding NA where groups have fewer columns.
4. Replace the VAR columns (5 through 9) with the new VAR columns.
Let me know if this works for you.
library(data.table)
f <- function(m) {
  m <- m[, 1:nrow(m)]                    # keep the square person-by-person block
  m[upper.tri(m)] <- t(m)[upper.tri(m)]  # copy the lower triangle into the upper
  m
}
# df1 is assumed to be the question's data frame (the dput above)
df1[, 5:9] <- rbindlist(lapply(split(df1[, 5:9], df1$HHID), f), fill = TRUE)
Output
HHID PERS MARSTAT SEX VAR1 VAR2 VAR3 VAR4 VAR5 FLAG
1 1 1 2 1 NA 1 4 4 4 0
2 1 2 2 2 1 NA 4 4 4 0
3 1 3 1 2 4 4 NA 6 6 0
4 1 4 1 1 4 4 6 NA 6 1
5 1 5 1 1 4 4 6 6 NA 0
6 2 1 2 2 NA 1 5 4 NA 0
7 2 2 2 1 1 NA 4 5 NA 0
8 2 3 1 2 5 4 NA 7 NA 1
9 2 4 1 1 4 5 7 NA NA 1
10 3 1 1 2 NA 4 4 NA NA 0
11 3 2 1 2 4 NA 6 NA NA 1
12 3 3 1 1 4 6 NA NA NA 0
13 4 1 2 2 NA 1 8 4 4 0
14 4 2 2 1 1 NA 12 4 4 0
15 4 3 5 2 8 12 NA 11 11 0
16 4 4 1 2 4 4 11 NA 6 1
17 4 5 1 1 4 4 11 6 NA 1
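As a quick check on the result above (a sketch), each household's VAR block should now be symmetric once the NA fill is treated as 0:
sapply(split(df1[, 5:9], df1$HHID), function(m) {
  m <- as.matrix(m[, 1:nrow(m)])  # square person-by-person block
  m[is.na(m)] <- 0                # treat NA as 0 for the comparison
  isSymmetric(unname(m))
})
#    1    2    3    4
# TRUE TRUE TRUE TRUE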
An additional solution:
library(tidyverse)   # dplyr, purrr, tidyr
df %>%
  mutate_all(~ replace_na(., 0)) %>%
  select(HHID, starts_with("VAR")) %>%
  group_by(HHID) %>%
  nest() %>%
  mutate(data = map(data, ~ {
    m <- as.matrix(.x)[, seq_len(nrow(.x)), drop = FALSE]  # square block per household
    as.data.frame(m + t(m))                                # mirror across the diagonal
  })) %>%
  unnest(data) %>%
  bind_cols(select(df, -starts_with("VAR"), -HHID))
You can split the data on the HHID, apply an anonymous function to do the matrix stuff, then unsplit it.
vars <- grep("^VAR", names(df))
df[, vars] <- unsplit(lapply(split(df[, vars], df$HHID), tt), df$HHID)
# HHID PERS MARSTAT SEX VAR1 VAR2 VAR3 VAR4 VAR5 FLAG
# 1 1 1 2 1 0 1 4 4 4 0
# 2 1 2 2 2 1 0 4 4 4 0
# 3 1 3 1 2 4 4 0 6 6 0
# 4 1 4 1 1 4 4 6 0 6 1
# 5 1 5 1 1 4 4 6 6 0 0
# 6 2 1 2 2 0 1 5 4 0 0
# 7 2 2 2 1 1 0 4 5 0 0
# 8 2 3 1 2 5 4 0 7 0 0
# 9 2 4 1 1 4 5 7 0 0 0
# 10 3 1 1 2 0 4 4 0 0 0
# 11 3 2 1 2 4 0 6 0 0 0
# 12 3 3 1 1 4 6 0 0 0 0
# 13 4 1 2 2 0 1 8 4 4 0
# 14 4 2 2 1 1 0 12 4 4 0
# 15 4 3 5 2 8 12 0 11 11 0
# 16 4 4 1 2 4 4 11 0 6 1
# 17 4 5 1 1 4 4 11 6 0 1
Here's the helper function; NAs are treated as 0 so the result matches the output above:
tt <- function(x) {
  x <- as.matrix(x)
  x[is.na(x)] <- 0                # treat NA as 0
  n <- nrow(x)
  sq <- x[, 1:n, drop = FALSE]    # square person-by-person block
  sq[upper.tri(sq)] <- 0          # clear the upper triangle
  x[, 1:n] <- sq + t(sq)          # mirror the lower triangle into it
  as.data.frame(x)                # keep all VAR columns so unsplit() works
}

Get sum of unique rows in table function in R

Suppose I have data which looks like this
Id Name Price sales Profit Month Category Mode Supplier
1 A 2 5 8 1 X K John
1 A 2 6 9 2 X K John
1 A 2 5 8 3 X K John
2 B 2 4 6 1 X L Sam
2 B 2 3 4 2 X L Sam
2 B 2 5 7 3 X L Sam
3 C 2 5 11 1 X M John
3 C 2 5 11 2 X L John
3 C 2 5 11 3 X K John
4 D 2 8 10 1 Y M John
4 D 2 8 10 2 Y K John
4 D 2 5 7 3 Y K John
5 E 2 5 9 1 Y M Sam
5 E 2 5 9 2 Y L Sam
5 E 2 5 9 3 Y M Sam
6 F 2 4 7 1 Z M Kyle
6 F 2 5 8 2 Z L Kyle
6 F 2 5 8 3 Z M Kyle
If I apply the table function, it just combines all the rows and the result is:
K L M
X 4 4 1
Y 2 1 3
Z 0 1 2
Now what if I want not the count over all rows, but only over those rows with a unique Id, so that it looks like:
K L M
X 2 2 1
Y 1 1 2
Z 0 1 1
Thanks
If df is your data.frame:
# Subset original data.frame to keep columns of interest
df1 <- df[,c("Id", "Category", "Mode")]
# Remove duplicated rows
df1 <- df1[!duplicated(df1),]
# Create table
with(df1, table(Category, Mode))
# Mode
# Category K L M
# X 2 2 1
# Y 1 1 2
# Z 0 1 1
Or in one line using unique (the [-1] drops the Id column after the duplicate Id/Category/Mode rows are removed):
table(unique(df[c("Id", "Category", "Mode")])[-1])
df <- structure(list(Id = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L), Name = structure(c(1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L), .Label = c("A",
"B", "C", "D", "E", "F"), class = "factor"), Price = c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), sales = c(5L, 6L, 5L, 4L, 3L, 5L, 5L, 5L, 5L, 8L, 8L, 5L,
5L, 5L, 5L, 4L, 5L, 5L), Profit = c(8L, 9L, 8L, 6L, 4L, 7L, 11L,
11L, 11L, 10L, 10L, 7L, 9L, 9L, 9L, 7L, 8L, 8L), Month = c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L), Category = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("X", "Y", "Z"
), class = "factor"), Mode = structure(c(1L, 1L, 1L, 2L, 2L,
2L, 3L, 2L, 1L, 3L, 1L, 1L, 3L, 2L, 3L, 3L, 2L, 3L), .Label = c("K",
"L", "M"), class = "factor"), Supplier = structure(c(1L, 1L,
1L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L
), .Label = c("John", "Kyle", "Sam"), class = "factor")), .Names = c("Id",
"Name", "Price", "sales", "Profit", "Month", "Category", "Mode",
"Supplier"), class = "data.frame", row.names = c(NA, -18L))
We can try
library(data.table)
dcast(unique(setDT(df[c('Category', 'Mode', 'Id')])),
      Category ~ Mode, value.var = 'Id', length)
# Category K L M
#1: X 2 2 1
#2: Y 1 1 2
#3: Z 0 1 1
Or with dplyr (plus tidyr for spread())
library(dplyr)
library(tidyr)
df %>%
  distinct(Id, Category, Mode) %>%
  group_by(Category, Mode) %>%
  tally() %>%
  spread(Mode, n, fill = 0)
# Category K L M
# (chr) (dbl) (dbl) (dbl)
#1 X 2 2 1
#2 Y 1 1 2
#3 Z 0 1 1
Or, as @David Arenburg suggested, a variant of the above is
df %>%
  distinct(Id, Category, Mode) %>%
  select(Category, Mode) %>%
  table()
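For reference, spread() has since been superseded by pivot_wider(); a sketch of the dplyr answer above in current tidyr (assuming dplyr >= 1.0 and tidyr >= 1.1):
library(dplyr)
library(tidyr)
df %>%
  distinct(Id, Category, Mode) %>%
  count(Category, Mode) %>%
  pivot_wider(names_from = Mode, values_from = n, values_fill = 0)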

Rescaling by group across data frames

I have two data frames
df1 <- structure(list(g1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"), g2 = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("a", "b", "c"), class = "factor"), val1 = 1:20, val2 = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 4L, 1L, 2L, 3L)), .Names = c("g1", "g2", "val1", "val2"), row.names = c(NA, -20L), class = "data.frame")
df2 <- structure(list(g1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"), g2 = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L), .Label = c("a", "b", "c"), class = "factor"), val3 = c(5L, 6L, 7L, 3L, 4L, 5L, 2L, 3L, 4L, 8L, 9L, 10L, 4L, 5L, 6L, 5L, 6L)), .Names = c("g1", "g2", "val3"), row.names = c(NA, -17L), class = "data.frame")
> df1
g1 g2 val1 val2
1 A a 1 1
2 A a 2 2
3 A a 3 3
4 A a 4 4
5 A b 5 1
6 A b 6 2
7 A b 7 3
8 A c 8 1
9 A c 9 2
10 A c 10 3
11 B a 11 1
12 B a 12 2
13 B a 13 3
14 B b 14 1
15 B b 15 2
16 B b 16 3
17 B b 17 4
18 B c 18 1
19 B c 19 2
20 B c 20 3
> df2
g1 g2 val3
1 A a 5
2 A a 6
3 A a 7
4 A b 3
5 A b 4
6 A b 5
7 A c 2
8 A c 3
9 B c 4
10 B a 8
11 B a 9
12 B a 10
13 B b 4
14 B b 5
15 B b 6
16 B c 5
17 B c 6
My aim is to rescale df1$val2 to take values between the min and max values of df2$val3 within the respective groups.
I tried this:
library(dplyr)
df1 <- df1 %.%
  group_by(g1, g2) %.%
  mutate(rescaled = (max(df2$val3) - min(df2$val3)) * (val2 - min(val2)) /
                    (max(val2) - min(val2)) + min(df2$val3))
But the output is different from what I expect. The problem is that I can neither cbind nor merge the two data frames due to their different lengths. Any hints?
Does this work?
library(plyr)
df3 <- ddply(df2, .(g1, g2), summarize, max.val=max(val3), min.val=min(val3))
merged.df <- merge(df1, df3, by=c("g1", "g2"), all.x=TRUE)
## Now rescale merged.df$val2 as desired
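To complete the rescaling step in the comment, a sketch following the OP's own formula (a linear rescale of val2 onto [min.val, max.val] within each g1/g2 group; rescaled.df is just an illustrative name):
library(dplyr)
rescaled.df <- merged.df %>%
  group_by(g1, g2) %>%
  mutate(rescaled = (max.val - min.val) * (val2 - min(val2)) /
                    (max(val2) - min(val2)) + min.val) %>%
  ungroup()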

Index a data frame row-by-row using column names selected from a variable

Consider the following data frame:
TEST <- structure(list(Value = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
Select = structure(c(2L, 1L, 3L, 2L, 2L, 1L, 1L,
2L, 1L, 1L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"),
A = c(5L, 5L, 4L, 3L, 4L, 3L, 5L, 3L, 3L, 4L, 5L, 4L),
B = c(10L, 8L, 7L, 6L, 3L, 8L, 8L, 7L, 8L, 9L, 11L, 8L),
C = c(0L, 1L, 3L, 2L, 0L, 3L, 0L, 2L, 0L, 1L, 1L, 0L)),
.Names = c("Value", "Select", "A", "B", "C"),
row.names = c(NA, -12L),
class = "data.frame")
I want to efficiently assign the Value column, on a row-by-row basis, from the set of columns A, B and C based on the Select column.
For example, in row 1 I want Value to be equal to the element in column B - i.e. Value[1]=10.
My current method is to use a for loop:
for (idx in 1:nrow(TEST)) {
  TEST$Value[idx] <- TEST[idx, as.character(TEST$Select[idx])]
}
Which results in the desired output:
Value Select A B C
1 10 B 5 10 0
2 5 A 5 8 1
3 3 C 4 7 3
4 6 B 3 6 2
5 3 B 4 3 0
6 3 A 3 8 3
7 5 A 5 8 0
8 7 B 3 7 2
9 3 A 3 8 0
10 4 A 4 9 1
11 1 C 5 11 1
12 0 C 4 8 0
Is there a more efficient or alternative way of doing this? I feel like this is some sort of merge() or table join type operation.
P.S. I wasn't quite sure how to describe this operation - any suggestions for a better question/description also welcome.
I would use matrix indexing and match. That approach is vectorized, and hence much faster than a for loop or apply:
L <- c("A", "B", "C")
TEST$Value <- TEST[L][cbind(seq_len(nrow(TEST)), match(TEST$Select, L))]
If you are not familiar with matrix indexing, it is documented inside ?"[":
A third form of indexing is via a numeric matrix with the one column for each dimension: each row of the index matrix then selects a single element of the array, and the result is a vector
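To make the index matrix concrete, its first rows pair each row number with the column position of that row's Select value within L:
L <- c("A", "B", "C")
head(cbind(seq_len(nrow(TEST)), match(TEST$Select, L)))
#      [,1] [,2]
# [1,]    1    2
# [2,]    2    1
# [3,]    3    3
# [4,]    4    2
# [5,]    5    2
# [6,]    6    1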
