Element-wise subsetting in R - r

I am trying to print specific observations from a data frame. Consider this simple example:
df <- data.frame(ID = c(1,1,1,2,2,2,3,3),
Week = c(1,2,2,1,1,2,1,1),
Y = c(4,2,6,7,5,3,1,9))
I would like to (only) print the rows where (ID = 1 & Week = 2), (ID = 2 & Week = 1) as well as (ID = 3 & Week = 1), giving this output:
rbind(df[(df$ID == 1) & (df$Week == 2),],
df[(df$ID == 2) & (df$Week == 1),],
df[(df$ID == 3) & (df$Week == 1),])
The values to be used for indexing are stored in a vector for each variable:
IDidx <- c(1,2,3)
Weekidx <- c(2,1,1)
Is there any solution that takes these vectors and indexes element-wise from them as I have done it "manually" using rbind()?
Thanks for your help!

We can create a data frame based on IDidx and Weekidx, and then use the semi_join from the dplyr package.
inx <- data.frame(ID = IDidx, Week = Weekidx)
library(dplyr)
df %>% semi_join(inx, by = c("ID", "Week"))
# ID Week Y
# 1 1 2 2
# 2 1 2 6
# 3 2 1 7
# 4 2 1 5
# 5 3 1 1
# 6 3 1 9

Related

Random Sample From a Dataframe With Specific Count

This question is probably best illustrated with an example.
Suppose I have a dataframe df with a binary variable b (values of b are 0 or 1). How can I take a random sample of size 10 from this dataframe so that I have 2 instances where b=0 in the random sample, and 8 instances where b=1 in the dataframe?
Right now, I know that I can do df[sample(nrow(df),10,] to get part of the answer, but that would give me a random amount of 0 and 1 instances. How can I specify a specific amount of 0 and 1 instances while still taking a random sample?
Here's an example of how I'd do this... take two samples and combine them. I've written a simple function so you can "just take one sample."
With a vector:
pop <- sample(c(0,1), 100, replace = TRUE)
yoursample <- function(pop, n_zero, n_one){
c(sample(pop[pop == 0], n_zero),
sample(pop[pop == 1], n_one))
}
yoursample(pop, n_zero = 2, n_one = 8)
[1] 0 0 1 1 1 1 1 1 1 1
Or, if you are working with a dataframe with some unique index called id:
# Where d1 is your data you are summarizing with mean and sd
dat <- data.frame(
id = 1:100,
val = sample(c(0,1), 100, replace = TRUE),
d1 = runif(100))
yoursample <- function(dat, n_zero, n_one){
c(sample(dat[dat$val == 0,"id"], n_zero),
sample(dat[dat$val == 1,"id"], n_one))
}
sample_ids <- yoursample(dat, n_zero = 2, n_one = 8)
sample_ids
mean(dat[dat$id %in% sample_ids,"d1"])
sd(dat[dat$id %in% sample_ids,"d1"])
Here is a suggestion:
First create a sample of 0 and 1 with id column.
Then sample 2:8 df's with condition and bind them together:
library(tidyverse)
set.seed(123)
df <- as_tibble(sample(0:1,size=50,replace=TRUE)) %>%
mutate(id = row_number())
df1 <- df[ sample(which (df$value ==0) ,2), ]
df2 <- df[ sample(which (df$value ==1), 8), ]
df_final <- bind_rows(df1, df2)
value id
<int> <int>
1 0 14
2 0 36
3 1 21
4 1 24
5 1 2
6 1 50
7 1 49
8 1 41
9 1 28
10 1 33
library(tidyverse)
set.seed(123)
df <- data.frame(a = letters,
b = sample(c(0,1),26,T))
bind_rows(
df %>%
filter(b == 0) %>%
sample_n(2),
df %>%
filter(b == 1) %>%
sample_n(8)
) %>%
arrange(a)
a b
1 d 1
2 g 1
3 h 1
4 l 1
5 m 1
6 o 1
7 p 0
8 q 1
9 s 0
10 v 1

Detect sequences of ordered strings and group them using R

I have a string vector with about 500K elements in it and I want to assign a value to each of the element to show the group number of each element.
The grouping criteria goes like this:
a group number is assigned consecutively from the top of the list
Each element should be assigned different groups unless if a minimum of 3 consecutive elements are in ascending alphabetical order, in which these consecutive elements will be in one group.
How do I do this in R?
For example and expected output:
> my_strings <- c("xx1", "1xxx", "abc.xyz", "a", "ad022", "ghj1", "kf1", "991r",
+ "jdd", "12vd", "r34o", "z", "034mh")
> expected_output <- c(1, 2, 3, 4, 4, 4, 4, 5, 6, 7, 7, 7, 8)
> (df <- data.frame(input = my_strings, output = expected_output))
input output
1 xx1 1
2 1xxx 2
3 abc.xyz 3
4 a 4
5 ad022 4
6 ghj1 4
7 kf1 4
8 991r 5
9 jdd 6
10 12vd 7
11 r34o 7
12 z 7
13 034mh 8
So far, I attempt to use dplyr::lead and assign order based on two consecutive elements. I don't know how to proceed from here though.
res <- as_tibble(my_strings) %>%
mutate(after = lead(my_strings))
res$pre_group = apply(res, 1, function(x) order(c(x[1], x[2]))[2])
(Dang, this was a tough one :-)
tidyverse
library(dplyr)
df %>%
mutate(r1 = cumsum(c(TRUE, diff(rank(input)) < 0)) + 0) %>%
group_by(r1) %>%
mutate(r2 = r1 + seq(0, 0.9*(n() < 3), len = n()) / n()) %>%
ungroup() %>%
mutate(r1 = with(list(rl = rle(r2)$lengths), rep(seq_along(rl), times = rl))) %>%
select(-r2)
# # A tibble: 13 x 3
# input output r1
# <chr> <dbl> <int>
# 1 xx1 1 1
# 2 1xxx 2 2
# 3 abc.xyz 3 3
# 4 a 4 4
# 5 ad022 4 4
# 6 ghj1 4 4
# 7 kf1 4 4
# 8 991r 5 5
# 9 jdd 6 6
# 10 12vd 7 7
# 11 r34o 7 7
# 12 z 7 7
# 13 034mh 8 8
(The lengthy with(...) in the mutate is just an inline version of data.table::rleid.)
data.table
library(data.table)
as.data.table(df)[
, r1 := cumsum(c(TRUE, diff(rank(input)) < 0)) + 0 ][
, r1 := r1 + seq(0, 0.9*(.N < 3), len = .N), by = .(r1) ][
, r1 := rleid(r1) ]
If you want to blur the lines of R-dialects a little, then
library(data.table)
library(magrittr)
as.data.table(df) %>%
.[, r1 := cumsum(c(TRUE, diff(rank(input)) < 0)) + 0 ] %>%
.[, r1 := r1 + seq(0, 0.9*(.N < 3), len = .N), by = .(r1) ] %>%
.[, r1 := rleid(r1) ]
Notes:
... + 0 is short-hand for as.numeric(...). This is because data.table enforces the column's original class when updating a column; since the first definition of r1 (without +0) would be integer, the next reassignment of r1 returns numeric. However, since data.table persists the original class, the numbers will be coerced (truncated) to integer and my efforts halted.
seq(0, 0.9*(...)) reduces to seq(0,0) when there are three or more in a group, which results in a no-op on that group. (This uses dplyr's n() and data.table's .N for group-size.)
the implementations differ slightly because dplyr prohibits modifying the grouping variable(s); data.table has no issue with this. (I'm not certain which direction is correct or better ...)
Not nearly as good as r2evans', but also seems to give the result.
x <- my_strings
n <- length(x)
c(FALSE,x[-1L] > x[-n]) &
c(FALSE,FALSE,x[-1L][-1L] > x[-n][-(n-1)]) &
c(FALSE,FALSE,FALSE,x[-1L][-1L][-1L] > x[-n][-(n-1)][-(n-2)])
(lead(x, 1) > x & lead(x,2) > lead(x,1)) |
(lag(x, 1) < x & lead(x,1) > x) |
(lag(x, 1) < x & lag(x,2) < lag(x,1)) -> condition
condition[is.na(condition)] <- FALSE # remove NAs
#to visualize
tibble(lag(x,2), lag(x,1), x, lead(x,1), lead(x,2), condition)
# There may be a better way than a loop
cur_class <- 0
classes <- integer(n)
for(i in 1:(n)){
if(!condition[i]){ #not in a sequence
cur_class <- cur_class + 1
classes[i] <- cur_class
} else if(!condition[i-1]){ #first of a sequence
cur_class <- cur_class + 1
classes[i] <- cur_class
} else{ #mid-sequence
classes[i] <- cur_class
}
}
tibble(x, classes, condition*1L)
# A tibble: 13 x 3
# x classes `condition * 1L`
# <chr> <dbl> <int>
# 1 xx1 1 0
# 2 1xxx 2 0
# 3 abc.xyz 3 0
# 4 a 4 1
# 5 ad022 4 1
# 6 ghj1 4 1
# 7 kf1 4 1
# 8 991r 5 0
# 9 jdd 6 0
# 10 12vd 7 1
# 11 r34o 7 1
# 12 z 7 1
# 13 034mh 8 0

Filtering in tidyverse based on a vector/list of possible values

I would like to select rows of a data frame based on conditions on two columns that should identify a unique row. In the concrete example below I would like to select
id=1,2,3... with a specific mtry value specified in a vector, i.e. For id=1, I just want the first line with mtry=3, for id=2 I would like mtry=5.
I tried using group_by and using filter e.g.
filter(df, (mtry,id) %in% c([3,1],[5,2],[3,3]))
but this gives an error
Error: unexpected ',' in .
What is the tidyverse way of doing this?
You can do this kind of filter with an inner join
library(dplyr)
df %>%
inner_join(tibble(mtry = c(3, 5, 3), id = c(1, 2, 3)))
Example:
set.seed(100)
df <- data.frame(mtry = sample(1:3, 100, T), id = sample(1:5, 100, T))
df %>%
inner_join(tibble(mtry = c(3, 5, 3), id = c(1, 2, 3)))
# Joining, by = c("mtry", "id")
# mtry id
# 1 3 1
# 2 3 3
# 3 3 3
# 4 3 3
# 5 3 1
# 6 3 3
# 7 3 1
# 8 3 1
# 9 3 1
# 10 3 3
# 11 3 1
# 12 3 3
# 13 3 1
You need to create different conditions for each combination
subset(df, (mtry == 3 & id == 1) | (mtry == 5 & id == 2) | (mtry == 3 & id == 3))
Or if you want tidyverse put the conditions in filter
library(dplyr)
df %>% filter((mtry == 3 & id == 1) | (mtry == 5 & id == 2) | (mtry == 3 & id == 3))
You can combine condition 1 and 3 to do
df %>% filter((mtry == 3 & id %in% c(1, 3)) | (mtry == 5 & id == 2))

Removing groups from dataframe if variable has repeated values

I would like to ask if there is a way of removing a group from dataframe using dplyr (or anz other way in that matter) in the following way. Lets say I have a dataframe in the following form grouped by variable 1:
Variable 1 Variable 2
1 a
1 b
2 a
2 a
2 b
3 a
3 c
3 a
... ...
I would like to remove only groups that have in Variable 2 two consecutive same values. That is in table above it would remove group 2 because there are values a,a,b but not group c where is a,c,a. So I would get the table bellow?
Variable 1 Variable 2
1 a
1 b
3 a
3 c
3 a
... ...
To test for consecutive identical values, you can compare a value to the previous value in that column. In dplyr, this is possible with lag. (You could do the same thing with comparing to the next value, using lead. Result comes out the same.)
Group the data by variable1, get the lag of variable2, then add up how many of these duplicates there are in that group. Then filter for just the groups with no duplicates. After that, feel free to remove the dupesInGroup column.
library(tidyverse)
df %>%
group_by(variable1) %>%
mutate(dupesInGroup = sum(variable2 == lag(variable2), na.rm = T)) %>%
filter(dupesInGroup == 0)
#> # A tibble: 5 x 3
#> # Groups: variable1 [2]
#> variable1 variable2 dupesInGroup
#> <int> <chr> <int>
#> 1 1 a 0
#> 2 1 b 0
#> 3 3 a 0
#> 4 3 c 0
#> 5 3 a 0
Created on 2018-05-10 by the reprex package (v0.2.0).
prepare data frame:
df <- data.frame("Variable 1" = c(1, 1, 2, 2, 2, 3, 3, 3), "Variable 2" = unlist(strsplit("abaabaca", "")))
write functions to test if consecutive repetitions are there or not:
any.consecutive.p <- function(v) {
for (i in 1:(length(v) - 1)) {
if (v[i] == v[i + 1]) {
return(TRUE)
}
}
return(FALSE)
}
any.consecutive.in.col.p <- function(df, col) {
any.consecutive.p(df[, col])
}
any.consecutive.p returns TRUE if it finds first consecutive repetition in a vector (v).
any.consecutive.in.col.p() looks for consecutive repetitions in a column of a data frame.
split data frame by values of Variable.1
df.l <- split(df, df$Variable.1)
df.l
$`1`
Variable.1 Variable.2
1 1 a
2 1 b
$`2`
Variable.1 Variable.2
3 2 a
4 2 a
5 2 b
$`3`
Variable.1 Variable.2
6 3 a
7 3 c
8 3 a
Finally go over this data.frame list and test for each data frame, if it contains consecutive duplicates in Variable.2 column.
If found, don't collect it.
Bind the collected data frames by rows.
Reduce(rbind, lapply(df.l, function(df) if(!any.consecutive.in.col.p(df, "Variable.2")) {df}))
Variable.1 Variable.2
1 1 a
2 1 b
6 3 a
7 3 c
8 3 a
Say you want to remove all groups of df, grouped by a, where the column b has repeated values. You can do that as below.
set.seed(0)
df <- data.frame(a = rep(1:3, rep(3, 3)), b = sample(1:5, 9, T))
# dplyr
library(dplyr)
df %>%
group_by(a) %>%
filter(all(b != lag(b), na.rm = T))
#data.table
library(data.table)
setDT(df)
df[, if(all(b != shift(b), na.rm = T)) .SD, by = a]
Benchmark shows data.table is faster
#Results
# Unit: milliseconds
# expr min lq mean median uq max neval
# use_dplyr() 141.46819 165.03761 201.0975 179.48334 205.82301 539.5643 100
# use_DT() 36.27936 50.23011 64.9218 53.87114 66.73943 345.2863 100
# Method
set.seed(0)
df <- data.table(a = rep(1:2000, rep(1e3, 2000)), b = sample(1:1e3, 2e6, T))
use_dplyr <- function(x){
df %>%
group_by(a) %>%
filter(all(b != lag(b), na.rm = T))
}
use_DT <- function(x){
df[, if (all(b != shift(b), na.rm = T)) .SD, a]
}
microbenchmark(use_dplyr(), use_DT())

Factor levels reaching certain values

I need to find out how many factor levels reach values of a continuous variable.
The code below produces the desired result for the example data, but it is rather an awkward work around.
My real dataframe is much larger and the real plot should show more values (or is continuous) on the x-axis. I would appreciate an applicable code a lot.
set.seed(5)
df <- data.frame(ID = factor(c("a","a","b","c","d","e","e")),values = runif(7,0,6))
seq <- 1:5
length.unique <- function(x) length(unique(x))
sub1 <- df[which(df$values >= 1), ]
sub2 <- df[which(df$values >= 2), ]
sub3 <- df[which(df$values >= 3), ]
sub4 <- df[which(df$values >= 4), ]
sub5 <- df[which(df$values >= 5), ]
N_IDs <- c(length.unique(sub1$ID),length.unique(sub2$ID),length.unique(sub3$ID),length.unique(sub4$ID),length.unique(sub5$ID))
plot(N_IDs ~ seq, type="b")
Using tidyverse, you can save some time by first calculating the max value for each ID,
library(tidyverse)
idmax <- df %>% group_by(ID) %>% summarize(max=max(values)) %>% pull(max)
Then for each cut point, return the count that pass
map_df(1:5, ~data.frame(cut=., count=sum(idmax >.)))
# cut count
# 1 1 4
# 2 2 3
# 3 3 3
# 4 4 3
# 5 5 1
Using non-equi joins:
library(data.table)
setDT(df)
df[.(seq = 1:5), on = .(values >= seq), allow = T, .(N_IDs = uniqueN(ID)), by = .EACHI]
# values N_IDs
#1: 1 4
#2: 2 3
#3: 3 3
#4: 4 3
#5: 5 1

Resources