Generate the same random sample each time in a loop using sample_frac

How can I generate the same random sample each time in a loop using sample_frac?
library(dplyr)
tbl = tibble(val = 1:50)
for (i in 1:3) {
  tbl_sample <- tbl %>% sample_frac(0.1)
  print(tbl_sample)
}
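One reading of the title is that the identical sample should be drawn on every iteration. A minimal sketch of that, using base R's set.seed() to reset the random number generator before each draw (the seed value is arbitrary):
for (i in 1:3) {
  set.seed(123)                           # same seed before each draw
  tbl_sample <- tbl %>% sample_frac(0.1)  # hence the same 5 rows every time
  print(tbl_sample)
}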

Create a list and then assign to each list element by looping over the sequence (print only displays the output on the console; it does not store anything):
n <- 3
lst1 <- vector('list', n)
for(i in seq_len(n)) {
  lst1[[i]] <- tbl %>%
    sample_frac(0.1)
}
If we also want the original row indices of the sampled rows:
for(i in seq_len(n)) {
  lst1[[i]] <- tbl %>%
    mutate(rn = row_number()) %>%
    slice_sample(prop = 0.1)
}
Alternatively, this can be done more compactly with replicate:
lst1 <- replicate(n, tbl %>% sample_frac(0.1), simplify = FALSE)
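If the stored samples are later needed as a single data frame, the list can be collapsed, for example with bind_rows() and an id column:
bind_rows(lst1, .id = "iteration")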

We can use purrr and loop through 1:3 with the same expression as the .f argument:
library(purrr)
library(dplyr)
map(1:3, ~{
  tbl_sample <- tbl %>% sample_frac(0.1)
  head(tbl_sample)
})
[[1]]
# A tibble: 5 x 1
    val
  <int>
1     1
2    20
3    35
4    32
5    19

[[2]]
# A tibble: 5 x 1
    val
  <int>
1    45
2    24
3    42
4    46
5    10

[[3]]
# A tibble: 5 x 1
    val
  <int>
1    35
2    23
3    28
4    49
5    43
You may want each sample as its own column of a single data frame. For that we can use imap_dfc:
imap_dfc(1:3, ~{
  tbl %>% sample_frac(0.1) %>% head %>% set_names(paste0('sample_', .y))
})
# A tibble: 5 x 3
  sample_1 sample_2 sample_3
     <int>    <int>    <int>
1       49       17        8
2       29        9        6
3       25       35       50
4        9        4       44
5       34       45        3

Related

Binding rows based on common id

I have a very simple case where I want to combine several data frames into one, based on the id values of a particular data frame.
Example:
id <- c(1, 2, 3)
x <- c(10, 12, 14)
data1 <- data.frame(id, x)
id <- c(2, 3)
x <- c(20, 22)
data2 <- data.frame(id, x)
id <- c(1, 3)
x <- c(30, 32)
data3 <- data.frame(id, x)
Which gives us:
$data1
id x
1 1 10
2 2 12
3 3 14
$data2
id x
1 2 20
2 3 22
$data3
id x
1 1 30
2 3 32
Now, I want to combine all three data frames based on the ids of data3. The expected output should look like:
> comb
id x
1 1 10
2 1 NA
3 1 30
4 3 14
5 3 22
6 3 32
I am trying the following, but not getting the expected output.
library(dplyr)
library(tidyr)
combined <- bind_rows(data1, data2, data3, .id = "id") %>% arrange(id)
Any idea how to get the expected output?
Does this work:
library(dplyr)
library(tidyr)
data1 %>%
  full_join(data2, by = 'id') %>%
  full_join(data3, by = 'id') %>%
  arrange(id) %>%
  right_join(data3, by = 'id') %>%
  pivot_longer(cols = -id) %>%
  select(-name) %>%
  distinct()
# A tibble: 6 x 2
id value
<dbl> <dbl>
1 1 10
2 1 NA
3 1 30
4 3 14
5 3 22
6 3 32
Bind the 3 data frames into one with an id column, use filter to keep only the ids present in the 3rd data frame, and complete the missing combinations:
library(dplyr)
library(tidyr)
bind_rows(data1, data2, data3, .id = "new_id") %>%
  filter(id %in% id[new_id == 3]) %>%
  complete(new_id, id)
# new_id id x
# <chr> <dbl> <dbl>
#1 1 1 10
#2 1 3 14
#3 2 1 NA
#4 2 3 22
#5 3 1 30
#6 3 3 32
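To match the expected two-column output exactly, the helper column can be dropped and the rows sorted, e.g.:
bind_rows(data1, data2, data3, .id = "new_id") %>%
  filter(id %in% id[new_id == 3]) %>%
  complete(new_id, id) %>%
  select(id, x) %>%
  arrange(id)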
A pure base R solution also works:
lst <- list(data1, data2, data3)
reshape(
  subset(
    reshape(
      do.call(rbind, Map(cbind, lst, grp = seq_along(lst))),
      idvar = "id",
      timevar = "grp",
      direction = "wide"
    ),
    id %in% lst[[3]]$id
  ),
  idvar = "id",
  varying = -1,
  direction = "long"
)[c("id", "x")]
which gives
id x
1.1 1 10
3.1 3 14
1.2 1 NA
3.2 3 22
1.3 1 30
3.3 3 32
Using base R:
do.call(rbind, unname(lapply(mget(ls(pattern = "^data\\d+$")), \(x) {
  x1 <- subset(x, id %in% data3$id)
  v1 <- setdiff(data3$id, x1$id)
  if(length(v1) > 0) rbind(x1, cbind(id = v1, x = NA)) else x1
})))
Output:
id x
1 1 10
3 3 14
2 3 22
11 1 NA
12 1 30
21 3 32
A similar idea, using complete() from tidyr:
bind_rows(data1, data2, data3, .id = 'grp') %>%
  complete(id, grp) %>%
  select(-grp) %>%
  filter(id %in% data3$id)
# A tibble: 6 x 2
id x
<dbl> <dbl>
1 1 10
2 1 NA
3 1 30
4 3 14
5 3 22
6 3 32

Counting all unique strings in a data frame containing strings and numeric values

I have a number of large data frames which contain the occasional string value, and I would like to know what the unique string values are (ignoring the numeric values) and, if possible, count them.
df <- data.frame(1:16)
df$A <- c("Name",0,0,0,0,0,12,12,0,14,NA_real_,14,NA_real_,NA_real_,16,16)
df$B <- c(10,0,"test",0,12,12,12,12,0,14,NA_real_,14,16,16,16,16)
df$C <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14,16)
X1.16 A B C
1 1 Name 10 10
2 2 0 0 12
3 3 0 test 14
4 4 0 0 16
5 5 0 12 10
6 6 0 12 12
7 7 12 12 14
8 8 12 12 16
9 9 0 0 10
10 10 14 14 12
11 11 <NA> <NA> 14
12 12 14 14 16
13 13 <NA> 16 10
14 14 <NA> 16 12
15 15 16 16 14
16 16 16 16 16
I know I can use the count function in dplyr, but I have too many unique numeric values, so this is not a great solution. In the code below I was able to filter my data to retain only rows that contain an alphabetical character (although this isn't a solution either).
df %>% filter_all(any_vars(str_detect(., pattern = "[:alpha:]")))
X1.16 A B C
1 1 Name 10 10
2 3 0 test 14
My desired output would be something to the effect of:
Variable n
"Name" 1
"test" 1
You can get the string values with grep and count them using table:
stack(table(grep('[[:alpha:]]', unlist(df), value = TRUE)))[2:1]
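With the example data above this gives:
#    ind values
# 1 Name      1
# 2 test      1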
If you want a tidyverse answer, you can get the data in long format, keep only the rows with characters in them, and count those:
library(dplyr)
df %>%
  mutate(across(.fns = as.character)) %>%
  tidyr::pivot_longer(cols = everything()) %>%
  filter(grepl('[[:alpha:]]', value)) %>%
  count(value)
# value n
# <chr> <int>
#1 Name 1
#2 test 1
@Ronak and @akrun above beat me to the punch; my solution is very similar, with an extension if you want counts within columns.
library(dplyr)
library(tidyr)
library(stringr)
# Coerce to tibble for ease of reading
df <- df %>%
  as_tibble() %>%
  mutate(across(.fns = as.character))
df %>%
  pivot_longer(cols = everything()) %>%
  summarise(Variable = str_subset(value, "[:alpha:]")) %>%
  count(Variable, sort = TRUE)
# A tibble: 2 x 2
Variable n
<chr> <int>
1 Name 1
2 test 1
# str_subset is a convenient wrapper around filter & str_detect
Add some extra words to test
# Test on extra word counts - replace 12 and 14 with words
df2 <- df
df2[df2 == 12] <- 'Name'
df2[df2 == 14] <- 'test'
df2
df2 %>%
  pivot_longer(cols = everything()) %>%
  summarise(Variable = str_subset(value, "[:alpha:]")) %>%
  count(Variable, sort = TRUE)
# A tibble: 2 x 2
Variable n
<chr> <int>
1 Name 12
2 test 10
If you want counts by column:
df2 %>%
  select(-1) %>%
  pivot_longer(everything(), names_to = 'col') %>%
  group_by(col) %>%
  summarise(Variable = str_subset(value, "[:alpha:]")) %>%
  count(col, Variable)
# A tibble: 6 x 3
# Groups: col [3]
col Variable n
<chr> <chr> <int>
1 A Name 3
2 A test 2
3 B Name 4
4 B test 3
5 C Name 4
6 C test 4
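Note that summarise() returning more than one row per group was deprecated in dplyr 1.1.0; with a current dplyr the same reshaping step can be written with reframe(), e.g.:
df2 %>%
  pivot_longer(cols = everything()) %>%
  reframe(Variable = str_subset(value, "[:alpha:]")) %>%
  count(Variable, sort = TRUE)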
We can use filter with across:
library(dplyr)
library(tidyr)
library(stringr)
library(purrr)
df %>%
  select(-1) %>%
  mutate(across(everything(), as.character)) %>%
  filter(across(everything(), ~ str_detect(., '[:alpha:]')) %>% reduce(`|`)) %>%
  pivot_longer(everything()) %>%
  filter(str_detect(value, '[:alpha:]')) %>%
  count(value)
# A tibble: 2 x 2
# value n
# <chr> <int>
#1 Name 1
#2 test 1
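As an aside, using across() inside filter() is superseded in current dplyr; the same row filter can be written with if_any():
df %>%
  select(-1) %>%
  mutate(across(everything(), as.character)) %>%
  filter(if_any(everything(), ~ str_detect(., '[:alpha:]'))) %>%
  pivot_longer(everything()) %>%
  filter(str_detect(value, '[:alpha:]')) %>%
  count(value)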

Can I substitute conditions into a dplyr filter?

I'm trying to pass a comparison operator (<, >, etc.) and a value into a function, to be evaluated inside a dplyr filter.
Other answers on SO suggest I should simply be able to use parse, but my sample function (without using the args) fails with:
Error in (...) & ~parse(text = "obj_200 > 0") : operations are possible only for numeric, logical or complex types
Sample function:
filterOrder = function(df, cond, value) {
  df = df %>%
    rownames_to_column('date') %>%
    filter( ...
      parse(text = "obj_200 > 0"),
      ...
    )
  ...
  return(df)
}
We can use parse_expr from rlang:
filterOrder = function(df, cond, value) {
  df %>%
    rownames_to_column('date') %>%
    filter(!!(rlang::parse_expr("obj_200 > 0")))
}
A reproducible example with iris:
data(iris)
iris %>%
  filter(!!(rlang::parse_expr('Species == "setosa"')))
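To actually use the cond and value arguments from the question, the condition string can be assembled before parsing. A sketch (the column name obj_200 is taken from the question; rownames_to_column() comes from the tibble package):
library(dplyr)
library(tibble)
filterOrder <- function(df, cond, value) {
  df %>%
    rownames_to_column('date') %>%
    # e.g. cond = ">", value = 0 builds the string "obj_200 > 0"
    filter(!!rlang::parse_expr(paste("obj_200", cond, value)))
}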
If you want to avoid string conditions altogether, you could do something like this:
library(magrittr)
df <- dplyr::tibble(A=1:10,
B=11:20,
C=10:1)
foo <- function(df, .cond){
.cond <- rlang::enquo(.cond)
res <- df %>%
dplyr::filter(!!.cond)
return(res)
}
foo(df, .cond = A > 5)
# A tibble: 5 x 3
A B C
<int> <int> <int>
1 6 16 5
2 7 17 4
3 8 18 3
4 9 19 2
5 10 20 1
foo(df, .cond = A == 1)
# A tibble: 1 x 3
A B C
<int> <int> <int>
1 1 11 10
foo(df, .cond = A > 3 & C < 4)
# A tibble: 3 x 3
A B C
<int> <int> <int>
1 8 18 3
2 9 19 2
3 10 20 1

Use Negation with Select in dplyr 0.7.x

I'm trying to write a function that needs to exclude a user-passed variable from the resulting data frame. I'm also taking this opportunity to learn a bit more about the new dplyr syntax.
The function acts like a cross join for data frames. I want to use it as a clean way of duplicating data across parameters of a function.
The function works as follows:
crossjoin_df <- function(df1, df2, temp_col = ".k") {
  df1 <- df1 %>%
    mutate(!!temp_col := 1)
  df2 <- df2 %>%
    mutate(!!temp_col := 1)
  out <- left_join(df1, df2, by = temp_col)
  # I'm trying to replace the next line
  out[, !names(out) == temp_col]
}
params <- data.frame(k = c(11,10),
n = c(27,26))
data <- data.frame(a = 1:3,
b = 4:6)
crossjoin_df(params, data) # 6 row data set
I want to see if it's possible to replace the last statement with a piped select statement. However, the negation does not seem to be working.
I am able to get something like:
out %>% select(!!temp_col)
to work, but that obviously only selects .k. I am not able to get anything like:
out %>% select(-!!temp_col)
to work.
You'll need rlang, the backend package for dplyr that enables tidy eval, whether you want to keep using strings, in which case you'll need sym to turn a string into a symbol:
library(dplyr)
params <- data.frame(k = c(11,10),
n = c(27,26))
data <- data.frame(a = 1:3,
b = 4:6)
crossjoin_df <- function(df1, df2, temp_col = ".k") {
  df1 <- df1 %>% mutate(!!temp_col := 1)
  df2 <- df2 %>% mutate(!!temp_col := 1)
  left_join(df1, df2, by = temp_col) %>%
    select(-!!rlang::sym(temp_col))
}
crossjoin_df(params, data)
#> k n a b
#> 1 11 27 1 4
#> 2 11 27 2 5
#> 3 11 27 3 6
#> 4 10 26 1 4
#> 5 10 26 2 5
#> 6 10 26 3 6
...or switch to full tidy eval, in which case you'll need quo_name to turn the quosure back into a string:
crossjoin_df <- function(df1, df2, temp_col = .k) {
  temp_col <- enquo(temp_col)
  df1 <- df1 %>% mutate(!!rlang::quo_name(temp_col) := 1)
  df2 <- df2 %>% mutate(!!rlang::quo_name(temp_col) := 1)
  left_join(df1, df2, by = rlang::quo_name(temp_col)) %>%
    select(-!!temp_col)
}
crossjoin_df(params, data)
#> k n a b
#> 1 11 27 1 4
#> 2 11 27 2 5
#> 3 11 27 3 6
#> 4 10 26 1 4
#> 5 10 26 2 5
#> 6 10 26 3 6
Alternatively, just use tidyr::crossing:
tidyr::crossing(params, data)
#> k n a b
#> 1 11 27 1 4
#> 2 11 27 2 5
#> 3 11 27 3 6
#> 4 10 26 1 4
#> 5 10 26 2 5
#> 6 10 26 3 6
You can use one_of, and then negate the selection with -:
out %>% select(-one_of(temp_col))
crossjoin_df <- function(df1, df2, temp_col = ".k") {
  # `$`(df1, temp_col) <- 1
  df1 <- df1 %>%
    mutate(!!temp_col := 1)
  # `$`(df2, temp_col) <- 1
  df2 <- df2 %>%
    mutate(!!temp_col := 1)
  left_join(df1, df2, by = temp_col) %>% select(-one_of(temp_col))
}
params <- data.frame(k = c(11,10),
n = c(27,26))
data <- data.frame(a = 1:3,
b = 4:6)
crossjoin_df(params, data)
# k n a b
#1 11 27 1 4
#2 11 27 2 5
#3 11 27 3 6
#4 10 26 1 4
#5 10 26 2 5
#6 10 26 3 6
This should work as well, though note that the underscore verbs like select_() are deprecated in current dplyr:
out %>% select_(paste0("-", temp_col))
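In current dplyr, the string case is handled with all_of(), which supersedes both select_() and one_of():
out %>% select(-all_of(temp_col))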

truncate dataset when variable falls below threshold

I wish to find, for each id, the day on which the variable hb first falls below 90. I can find the day on which hb reaches its minimum:
f <- function(x) 1:length(x) <= which.min(x)
ind <- as.logical(ave(df$hb, df$id, FUN = f))
dfhb <- df[ind, ]
maxday <- dfhb %>% group_by(id) %>% summarise(daymax = last(day))
However, I can't get it to work for hb < 90:
f2 <- function(x) 1:length(x) <= which(x <= 90)
ind <- as.logical(ave(df$hb, df$id, FUN = f2))
dfhb <- df[ind, ]
maxday <- dfhb %>% group_by(id) %>% summarise(daymax = last(day))
summary(maxday$daymax)
I would be very grateful for your advice, bw Annemarie
id day hb
1 1 95
1 2 NA
1 3 91
1 4 89
2 1 98
2 2 87
2 3 84
3 1 89
3 2 92
3 3 89
Here is an idea for obtaining the first observation that satisfies your threshold:
df %>%
  group_by(id) %>%
  summarise(daymax = which(hb < 90)[1])
# A tibble: 3 × 2
# id daymax
# <int> <int>
#1 1 4
#2 2 2
#3 3 1
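Note that which(hb < 90)[1] returns the row position within each group, which here happens to coincide with day. If days can be missing or out of order, it is safer to index into day explicitly; a minimal sketch:
df %>%
  group_by(id) %>%
  summarise(daymax = day[which(hb < 90)[1]])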
