can anybody explain the following difference:
library(tidyverse)
tribble(~id,
c(1:10))%>%
unnest_longer(id)%>%
mutate(data = map(.x = id, ~mtcars))%>%
unnest_longer(data)
gives:
# A tibble: 320 x 2
id data$mpg $cyl $disp $hp $drat $wt $qsec $vs $am $gear $carb
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
2 1 21 6 160 110 3.9 2.88 17.0 0 1 4 4
3 1 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
4 1 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
5 1 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
whereas
library(tidyverse)
tribble(~id,
c(1:10))%>%
unnest_longer(id)%>%
mutate(data = map(.x = id, ~mtcars))%>%
unnest(data)
gives the result I want.
# A tibble: 320 x 12
id mpg cyl disp hp drat wt qsec vs am gear carb
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
2 1 21 6 160 110 3.9 2.88 17.0 0 1 4 4
3 1 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
4 1 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
5 1 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
Why are there $-Signs in the 1st example of code?
Thanks in advance!
This is a feature introduced in {tibble} version 2.0.1 - docs.
To get the same results as tidyr::unnest(), you'll need to add tidyr::unpack().
tribble(
~id,
c(1:10)
) %>%
tidyr::unnest_longer(id) %>%
dplyr::mutate(data = purrr::map(
.x = id,
~ mtcars
)) %>%
tidyr::unnest_longer(data) %>%
tidyr::unpack(cols = data)
Related
Given the tibble -
library(tidyverse)
df <- mtcars %>% as_tibble() %>% slice(1:5)
df
# A tibble: 32 x 11
mpg cyl disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
I know you can use something like df %>% select(c(mpg:vs)) to select a range of columns without typing all the column names in the select statement. How do I do something similar in a case_when statement. I have a dataset with about 35 columns and want to flag rows where all columns are equal to 0.
We can use where condition in select
df %>%
select(where(~ all(. %in% c(0, 1))))
-output
# A tibble: 5 x 2
vs am
<dbl> <dbl>
1 0 1
2 0 1
3 1 1
4 1 0
5 0 0
If we want to create a new column 'flag' that checks if all the column values are 0 for a particular row
df %>%
mutate(new = !rowSums(cur_data() != 0))
I'm not sure if we can use case_when about this problem, but we can use the following solution:
library(dplyr)
df %>%
rowwise() %>%
mutate(flag = +(all(c_across(where(is.numeric)) == 0)))
# A tibble: 5 x 12
# Rowwise:
mpg cyl disp hp drat wt qsec vs am gear carb flag
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 0
2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 0
3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 0
4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 0
5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 0
I am trying to write a custom function that uses rlang's non-standard evaluation to group a dataframe by more than one variable.
This is what I've-
library(rlang)
# function definition
tryfn <- function(data, groups, ...) {
# preparing data
df <- dplyr::group_by(data, !!!rlang::enquos(groups))
print(head(df))
# applying some function `.f` on df that absorbs `...`
# .f(df, ...)
}
This works with a single grouping variable-
# works
tryfn(mtcars, am)
#> # A tibble: 6 x 11
#> # Groups: am [2]
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
#> 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
#> 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
#> 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
But if try to use more than one grouping variable, this doesn't work-
# doesn't work
tryfn(mtcars, c(am, cyl))
#> Error: Column `c(am, cyl)` must be length 32 (the number of rows) or one, not 64
# doesn't work
tryfn(mtcars, list(am, cyl))
#> Error: Column `list(am, cyl)` must be length 32 (the number of rows) or one, not 2
We could parse as an expression with enexpr and use !!!
tryfn <- function(data, groups, ...) {
groups <- as.list(rlang::enexpr(groups))
groups <- if(length(groups) > 1) groups[-1] else groups
group_by(data, !!!groups)
}
-testing
tryfn(mtcars, am)
# A tibble: 32 x 11
# Groups: am [2]
# mpg cyl disp hp drat wt qsec vs am gear carb
# * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
# 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
# 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
# 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
# 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
# 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
# 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
# 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
# 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
#10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
# … with 22 more rows
tryfn(mtcars, c(am, cyl))
# A tibble: 32 x 11
# Groups: am, cyl [6]
# mpg cyl disp hp drat wt qsec vs am gear carb
# * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
# 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
# 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
# 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
# 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
# 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
# 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
# 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
# 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
#10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
# … with 22 more rows
I would like to take a feature and spread it's values as columns with 1/0 if true/false e.g.
mtcars %>%
pivot_wider(names_from = cyl,
values_from = 1)
This appears to have done something, cyl has now been spread into columns except the values are things like 21, 21.4 or NA.
> mtcars %>%
+ pivot_wider(names_from = cyl,
+ values_from = 1)
# A tibble: 32 x 12
disp hp drat wt qsec vs am gear carb `6` `4` `8`
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 160 110 3.9 2.62 16.5 0 1 4 4 21 NA NA
2 160 110 3.9 2.88 17.0 0 1 4 4 21 NA NA
3 108 93 3.85 2.32 18.6 1 1 4 1 NA 22.8 NA
4 258 110 3.08 3.22 19.4 1 0 3 1 21.4 NA NA
5 360 175 3.15 3.44 17.0 0 0 3 2 NA NA 18.7
6 225 105 2.76 3.46 20.2 1 0 3 1 18.1 NA NA
7 360 245 3.21 3.57 15.8 0 0 3 4 NA NA 14.3
8 147. 62 3.69 3.19 20 1 0 4 2 NA 24.4 NA
9 141. 95 3.92 3.15 22.9 1 0 4 2 NA 22.8 NA
10 168. 123 3.92 3.44 18.3 1 0 4 4 19.2 NA NA
I tried using values_fill like so:
> mtcars %>%
+ pivot_wider(names_from = cyl,
+ values_from = 1,
+ values_fill = list(1 = 0))
Error: unexpected '=' in:
" values_from = 1,
values_fill = list(1 ="
How can I spread cyl across columns with binary 1 or 0 values depending on whether or not the cyl is 4, 6 or 8?
Is pivot_wider() what I want?
Set mpg to 1 and set the fill for mpg to 0 like this:
mtcars %>%
mutate(mpg = 1) %>%
pivot_wider(names_from = cyl, values_from = mpg, values_fill = list(mpg = 0))
## # A tibble: 32 x 12
## disp hp drat wt qsec vs am gear carb `6` `4` `8`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 160 110 3.9 2.62 16.5 0 1 4 4 1 0 0
## 2 160 110 3.9 2.88 17.0 0 1 4 4 1 0 0
## 3 108 93 3.85 2.32 18.6 1 1 4 1 0 1 0
## ... etc ...
or given the problem that pivot_wider currently has with ordering the columns you may prefer the older spread:
mtcars %>%
mutate(mpg = 1) %>%
spread(cyl, mpg, fill = 0)
## disp hp drat wt qsec vs am gear carb 4 6 8
## 1 71.1 65 4.22 1.835 19.90 1 1 4 1 1 0 0
## 2 75.7 52 4.93 1.615 18.52 1 1 4 2 1 0 0
## 3 78.7 66 4.08 2.200 19.47 1 1 4 1 1 0 0
## ... etc ...
Alternately specify values_fn like this:
mtcars %>%
pivot_wider(names_from = cyl, values_from = mpg,
values_fn = list(mpg = ~ 1), values_fill = list(mpg = 0))
## # A tibble: 32 x 12
## disp hp drat wt qsec vs am gear carb `6` `4` `8`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 160 110 3.9 2.62 16.5 0 1 4 4 1 0 0
## 2 160 110 3.9 2.88 17.0 0 1 4 4 1 0 0
## 3 108 93 3.85 2.32 18.6 1 1 4 1 0 1 0
## ...etc...
An option would be to use the names and values from cyl and then recode this based on is.na:
mtcars %>%
pivot_wider(names_from = cyl,
values_from = cyl) %>%
mutate_at(vars(!!!syms(as.character(unique(mtcars$cyl)))), ~if_else(is.na(.), 0, 1))
# A tibble: 32 x 13
# mpg disp hp drat wt qsec vs am gear carb `6` `4` `8`
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 21 160 110 3.9 2.62 16.5 0 1 4 4 1 0 0
# 2 21 160 110 3.9 2.88 17.0 0 1 4 4 1 0 0
# 3 22.8 108 93 3.85 2.32 18.6 1 1 4 1 0 1 0
# 4 21.4 258 110 3.08 3.22 19.4 1 0 3 1 1 0 0
# 5 18.7 360 175 3.15 3.44 17.0 0 0 3 2 0 0 1
# 6 18.1 225 105 2.76 3.46 20.2 1 0 3 1 1 0 0
# 7 14.3 360 245 3.21 3.57 15.8 0 0 3 4 0 0 1
# 8 24.4 147. 62 3.69 3.19 20 1 0 4 2 0 1 0
# 9 22.8 141. 95 3.92 3.15 22.9 1 0 4 2 0 1 0
#10 19.2 168. 123 3.92 3.44 18.3 1 0 4 4 1 0 0
I have a number of dataframes and a series of changes I want to make to each of them. For this example, let to the desired change be simply making each data frame a tibble using as_tibble. I know there are various ways of doing this, but I'd like to do this using purrr:walk.
For data frames df1 and df2,
df1 <- mtcars
df2 <- mtcars
I'd like to do the equivalent of
df1 %<>% as_tibble
df2 %<>% as_tibble
using walk. My attempt:
library(tidyverse)
walk(c(df1, df2), ~ assign(deparse(substitute(.)), as_tibble(.)))
This runs but does not make the desired change:
is_tibble(df1)
#> [1] FALSE
Here is how you can combine assign with walk (see the comments the code for more explanation)-
library(tidyverse)
# data
df1 <- mtcars
df2 <- mtcars
# creating tibbles
# this creates a list of objects with names ("df1", "df2")
tibble::lst(df1, df2) %>%
purrr::walk2(
.x = names(.), # names to assign
.y = ., # object to be assigned
.f = ~ assign(x = .x,
value = tibble::as.tibble(.y),
envir = .GlobalEnv)
)
# checking the newly created tibbles
df1
#> # A tibble: 32 x 11
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
#> 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
#> 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
#> 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
#> 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
#> 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
#> 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
#> 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
#> # ... with 22 more rows
df2
#> # A tibble: 32 x 11
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
#> 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
#> 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
#> 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
#> 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
#> 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
#> 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
#> 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
#> # ... with 22 more rows
Created on 2018-11-13 by the reprex package (v0.2.1)
Is there an easy way to filter my data frame so that any rows after and including a row that follows some condition are filtered out? The issue here is that I want it to be robust enough to handle a case where that condition is not met, in which the whole data frame will be returned. Check out my examples below if that sounds confusing:
library(dplyr)
## Works
mtcars %>%
as_tibble() %>%
filter(between(row_number(), 1, which(mpg == 17.8)))
#> # A tibble: 11 x 11
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
#> 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
#> 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
#> 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
#> 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
#> 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
#> 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
#> 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
#> 11 17.8 6 168. 123 3.92 3.44 18.9 1 0 4 4
## Doesn't work
mtcars %>%
as_tibble() %>%
filter(between(row_number(), 1, which(mpg == 30.5)))
#> Error in filter_impl(.data, quo): Evaluation error: Expecting a single value: [extent=0]..
Created on 2018-08-12 by the reprex package (v0.2.0).
You could include an ifelse statement to check whether the value is present in the dataframe. Also, you need to select the first row where the condition is verified to account for cases where the value is present more than once (in your example 21.0)
library(dplyr)
mtcars %>%
as_tibble() %>%
filter(between(row_number(), 1,ifelse(!any(mpg == 30),n(),which(mpg == 30)[1]-1)))
## returns the whole tibble
mtcars %>%
as_tibble() %>%
filter(between(row_number(), 1,ifelse(!any(mpg == 21),n(),which(mpg == 21)[1]-1)))
## Returns a tibble with 0 rows
mtcars %>%
as_tibble() %>%
filter(between(row_number(), 1,ifelse(!any(mpg == 21.4),n(),which(mpg == 21.4)[1]-1)))
## returns:
# A tibble: 3 x 11
mpg cyl disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
3 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
I think your specific example does not work because there is no mpg that equals 30.5, however, you get the same error with mpg equals 21.0 because there are two rows with that value. You will need to chose whether you want the first or the last instance of that condition
library(tidyverse)
#max row
mtcars %>%
as_tibble() %>%
filter(between(row_number(), 1, which(mtcars$mpg == 21.0)[length(which(mtcars$mpg == 21.0))]))
#> # A tibble: 2 x 11
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
or
#min row
mtcars %>%
as_tibble() %>%
filter(between(row_number(), 1, which(mtcars$mpg == 21.0)[1]))
#> # A tibble: 1 x 11
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
The example I chose just happened to be rows 1 and 2, but it illustrates the idea.
EDIT
The other answer by Lamia is much more elegant, and I probably thought about this too hard, but I felt like I needed to come up with something
library(dplyr)
filter_if_condition <- function(.data, condition, yes){
test_cond <- enquo(condition)
yes_filter <- enquo(yes)
if(.data %>% filter(!!test_cond) %>% nrow() > 0){
.data %>% filter(!!yes_filter)
}
else{.data}
}
mtcars %>%
as_tibble() %>%
filter_if_condition(366.0 %in% mpg, between(row_number(), 1, which(mpg == 366)[1]))
#> # A tibble: 32 x 11
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
#> 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
#> 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
#> 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
#> 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
#> 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
#> 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
#> 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
#> # ... with 22 more rows
mtcars %>%
as_tibble() %>%
filter_if_condition(18.1 %in% mpg, between(row_number(), 1, which(mpg == 18.1)[1]))
#> # A tibble: 6 x 11
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
#> 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
#> 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
#> 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1