I have some data in the format below, where all columns are of type chr.
#> # A tibble: 3 x 4
#> id age name income
#> <chr> <chr> <chr> <chr>
#> 1 1 18 jim 100
#> 2 2 21 bob 200
#> 3 3 16 alice 300
I'd like to use as.numeric() on only some columns. Preferably, I'd like to define a vector of column names and then use purrr:map to map as.numeric() to only those columns:
numeric_variables <- c("id", "age", "income")
How can I map that?
My desired output would look like:
df
#> # A tibble: 3 x 4
#> id age name income
#> <dbl> <dbl> <chr> <dbl>
#> 1 1 18 jim 100
#> 2 2 21 bob 200
#> 3 3 16 alice 300
Code for data entry below.
library(purrr)
df <- data.frame(stringsAsFactors=FALSE,
id = c(1, 2, 3),
age = c(18, 21, 16),
name = c("jim", "bob", "alice"),
income = c(100, 200, 300)
)
df <- map_df(df, as.character)
df
Created on 2020-02-15 by the reprex package (v0.3.0)
We can use mutate_at
library(dplyr)
# Convert only the columns listed in `numeric_variables` to numeric.
# all_of() states explicitly that the names come from an external character
# vector (and errors if one of them is missing from `df`).
df %>%
  mutate_at(vars(all_of(numeric_variables)), as.numeric) %>%
  as_tibble
# A tibble: 3 x 4
# id age name income
# <dbl> <dbl> <chr> <dbl>
#1 1 18 jim 100
#2 2 21 bob 200
#3 3 16 alice 300
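Or more easily, if every column that looks numeric should be converted (type.convert() guesses the type of each column and does not use numeric_variables)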
df %>%
type.convert(as.is = TRUE)
Or with map
library(purrr)
df %>%
map_if(names(.) %in% numeric_variables, as.numeric) %>%
bind_cols
# A tibble: 3 x 4
# id age name income
# <dbl> <dbl> <chr> <dbl>
#1 1 18 jim 100
#2 2 21 bob 200
#3 3 16 alice 300
Or if we use the compound assignment operator (%<>%), this can be assigned in place
library(magrittr)
# %<>% pipes `df` through the chain and assigns the final result back to `df`.
# Only the first operator of the chain is the assignment pipe; the remaining
# steps use the ordinary pipe %>%.
df %<>%
  map_if(names(.) %in% numeric_variables, as.numeric) %>%
  bind_cols
str(df)
#tibble [3 × 4] (S3: tbl_df/tbl/data.frame)
# $ id : num [1:3] 1 2 3
# $ age : num [1:3] 18 21 16
# $ name : chr [1:3] "jim" "bob" "alice"
# $ income: num [1:3] 100 200 300
You can use map_at
df[] <- purrr::map_at(df, numeric_variables, as.numeric)
df
# A tibble: 3 x 4
# id age name income
# <dbl> <dbl> <chr> <dbl>
#1 1 18 jim 100
#2 2 21 bob 200
#3 3 16 alice 300
Related
I am struggling to convert a nested tabular and space delimited character string to wide data. My data looks something like this:
#Sample data
df <- tibble(id = 1:3,
text_table = c(NA,
"P1\tP2\tL1\n
M1\t15.09\t10.45\n
M2\t3040\t1959\n
M3\t0\t660\n",
NA))
df
# A tibble: 3 × 2
id text_table
<int> <chr>
1 1 NA
2 2 "P1\tP2\tL1\n\n M1\t15.09\t10.45\n\n M2\t3040\t1959\n\n …
3 3 NA
Using read_table from the readr package I can convert a single text string to a table, but I would like to apply this to the entire column (text_table) and then convert the data to non-nested wide data.
library(readr)
read_table(df$text_table[2])
# A tibble: 3 × 3
P1 P2 L1
<chr> <dbl> <dbl>
1 M1 15.1 10.4
2 M2 3040 1959
3 M3 0 660
My desired output would look something like:
# A tibble: 3 × 7
id P2_M1 P2_M2 P2_M3 L1_M1 L1_M2 L1_M3
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 NA NA NA NA NA NA
2 2 15.1 3040 0 10.4 1959 660
3 3 NA NA NA NA NA NA
I don't know how to apply the function read_table to all the column elements. Using apply and lapply has not worked for me.
I’d first write a helper function to transform a single table to the format
you are after:
library(tidyverse)
df <- tibble(
id = 1:3,
text_table = c(
NA,
"P1\tP2\tL1\n
M1\t15.09\t10.45\n
M2\t3040\t1959\n
M3\t0\t660\n",
NA
)
)
# Turn one whitespace-separated text table into a single wide row.
#
# x: a length-one character string holding the table, or NA.
# Returns a tibble: empty when `x` is NA, otherwise one row whose columns are
# the P2 and L1 values spread by the labels found in P1.
read_text_table <- function(x) {
  # Nothing to parse for a missing entry.
  if (is.na(x)) {
    return(tibble())
  }
  # I() marks the string as literal data rather than a file path.
  parsed <- read_table(I(x))
  pivot_wider(parsed, names_from = P1, values_from = c(P2, L1))
}
read_text_table(df$text_table[2])
#> # A tibble: 1 × 6
#> P2_M1 P2_M2 P2_M3 L1_M1 L1_M2 L1_M3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 15.1 3040 0 10.4 1959 660
Because the function is going to be applied to each element in the column,
it needs to also gracefully handle an NA input. Here we just return an
empty tibble in that case.
Then use rowwise() to apply it to each element in the text_table column:
# Parse the text_table of every row; rows whose text_table is NA contribute
# zero rows because read_text_table() returns an empty tibble for them.
# .groups = "drop" returns an ungrouped tibble, so no grouping message is
# emitted and no stale grouping is carried into later steps.
tables <- df |>
  rowwise(id) |>
  summarise(
    read_text_table(text_table),
    .groups = "drop"
  )
tables
#> # A tibble: 1 × 7
#> id P2_M1 P2_M2 P2_M3 L1_M1 L1_M2 L1_M3
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2 15.1 3040 0 10.4 1959 660
Because the read_text_table() helper returns a 0-row data frame when there
is no table to read, we need a join to get the final result:
df |>
select(id) |>
left_join(tables, by = "id")
#> # A tibble: 3 × 7
#> id P2_M1 P2_M2 P2_M3 L1_M1 L1_M2 L1_M3
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 NA NA NA NA NA NA
#> 2 2 15.1 3040 0 10.4 1959 660
#> 3 3 NA NA NA NA NA NA
I'm using the sample dataset below:
mytable <- read.table(text=
"group team num ID
1 a x 1 9
2 a x 2 4
3 a y 3 5
4 a y 4 9
5 b x 1 7
6 b y 4 4
7 b x 3 9
8 b y 2 8",
header = TRUE, stringsAsFactors = FALSE)
I want to create separate data frames for each set of variables that I want to group by, I also want to group by two variables as well... I'm not sure how to do that. For example, I want a separate dataframe that groups the data by both team and ID as well... how do I do that?
library(dplyr)
lapply(c("group","team","ID",c("team","ID")), function(x){
group_by(mytable,across(c(x,num)))%>%summarise(Count = n()) %>% mutate(new=x)%>% as.data.frame()
})
See if this is what you want.
library(dplyr)
# Each list element is one grouping specification; a character vector with
# several names groups by all of them at once.
cols <- list("group", "team", "ID", c("team", "ID"))
# For every grouping specification, count the rows per group and record which
# columns were used in `new`. Returns a list of plain data frames.
lapply(cols, function(x, dat = mytable) {
  dat %>%
    # all_of() selects columns by the names stored in the character vector `x`.
    group_by(across(all_of(x))) %>%
    summarise(Count = n()) %>%
    mutate(new = toString(x)) %>%
    as.data.frame()
})
# `summarise()` has grouped output by 'team'. You can override using the `.groups` argument.
# [[1]]
# group Count new
# 1 a 4 group
# 2 b 4 group
#
# [[2]]
# team Count new
# 1 x 4 team
# 2 y 4 team
#
# [[3]]
# ID Count new
# 1 4 2 ID
# 2 5 1 ID
# 3 7 1 ID
# 4 8 1 ID
# 5 9 3 ID
#
# [[4]]
# team ID Count new
# 1 x 4 1 team, ID
# 2 x 7 1 team, ID
# 3 x 9 2 team, ID
# 4 y 4 1 team, ID
# 5 y 5 1 team, ID
# 6 y 8 1 team, ID
# 7 y 9 1 team, ID
Does this, based on tidyverse, give you what you want?
library(tidyverse)
# Split `mytable` into one tibble per (team, ID) combination.
mytable %>%
  group_by(team, ID) %>%
  group_split()
<list_of<
tbl_df<
group: character
team : character
num : integer
ID : integer
>
>[7]>
[[1]]
# A tibble: 1 × 4
group team num ID
<chr> <chr> <int> <int>
1 a x 2 4
[[2]]
# A tibble: 1 × 4
group team num ID
<chr> <chr> <int> <int>
1 b x 1 7
[[3]]
# A tibble: 2 × 4
group team num ID
<chr> <chr> <int> <int>
1 a x 1 9
2 b x 3 9
[[4]]
# A tibble: 1 × 4
group team num ID
<chr> <chr> <int> <int>
1 b y 4 4
[[5]]
# A tibble: 1 × 4
group team num ID
<chr> <chr> <int> <int>
1 a y 3 5
[[6]]
# A tibble: 1 × 4
group team num ID
<chr> <chr> <int> <int>
1 b y 2 8
[[7]]
# A tibble: 1 × 4
group team num ID
<chr> <chr> <int> <int>
1 a y 4 9
I have a tibble with columns named as numbers (e.g. 1). I created a function to compute differences between columns, but I don't know how to do it with that type of columns:
<!-- language-all: lang-r -->
library(tidyverse)
df <- tibble(`1` = c(1,2,3), `2` = c(2,4,6))
# This works
df %>%
mutate(diff = `1` - `2`)
#> # A tibble: 3 x 3
#> `1` `2` diff
#> <dbl> <dbl> <dbl>
#> 1 1 2 -1
#> 2 2 4 -2
#> 3 3 6 -3
# But this doesn't
calc_diffs <- function(x, y){
df %>%
mutate(diff := !!x - !!y)
}
calc_diffs(1, 2)
#> # A tibble: 3 x 3
#> `1` `2` diff
#> <dbl> <dbl> <dbl>
#> 1 1 2 -1
#> 2 2 4 -1
#> 3 3 6 -1
<sup>Created on 2020-10-14 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
We can convert to a symbol and evaluate
# Add a `diff` column holding column `x` minus column `y` of the global `df`.
# x, y: column names given as strings (e.g. "1"); each is converted to a
# symbol and unquoted so that mutate() looks it up as a column.
calc_diffs <- function(x, y) {
  minuend <- rlang::sym(x)
  subtrahend <- rlang::sym(y)
  mutate(df, diff = !!minuend - !!subtrahend)
}
Then, we just pass a string as argument
calc_diffs("1", "2")
# A tibble: 3 x 3
# `1` `2` diff
# <dbl> <dbl> <dbl>
#1 1 2 -1
#2 2 4 -2
#3 3 6 -3
Column names are strings. We could pass an index to subset the column, but here the column name is an unusual name that starts with a number. So we can either wrap it in backticks (built with paste) or just pass a string, convert it to a symbol and evaluate it (!!).
Does this work:
> df <- tibble(`1` = c(1,2,3), `2` = c(2,4,6))
> df
# A tibble: 3 x 2
`1` `2`
<dbl> <dbl>
1 1 2
2 2 4
3 3 6
> calc_diffs <- function(x, y){
+ df %>%
+ mutate(diff = {{x}} - {{y}})
+ }
> calc_diffs(`1`,`2`)
# A tibble: 3 x 3
`1` `2` diff
<dbl> <dbl> <dbl>
1 1 2 -1
2 2 4 -2
3 3 6 -3
>
I have a tibble with student test data, and I wish to convert these to percentiles using dplyr. For the sake of having a minimal example, imagine the following setup of three students.
# library() stops with an error if tidyverse is not installed, whereas
# require() would only return FALSE and let later calls fail obscurely.
library(tidyverse)
tbl <- tibble(Name = c("Alice", "Bob", "Cat"), Test = c(16, 13, 15))
The following code works and yields the desired output.
tbl %>% mutate(TestPercentile = cume_dist(Test) * 100)
# A tibble: 3 x 3
Name Test TestPercentile
<chr> <dbl> <dbl>
1 Alice 16 100
2 Bob 13 33.3
3 Cat 15 66.7
However, I actually want to do it programmatically because there are many such columns.
colname <- "Test"
percname <- str_c(colname, "Percentile")
tbl %>% mutate({{percname}} := cume_dist({{colname}}) * 100)
# A tibble: 3 x 3
Name Test TestPercentile
<chr> <dbl> <dbl>
1 Alice 16 100
2 Bob 13 100
3 Cat 15 100
Why does cume_dist make the percentile 100 for all students when I try to use tidy evaluation like this? (And ideally, if I can be permitted a second question, how can I fix it?)
If by programmatically you mean you want to write your own function, you can do it like this:
# Add a "<colname>Percentile" column with the percentile rank of `colname`.
#
# data:    a data frame / tibble.
# colname: unquoted column name, e.g. Test.
# Returns `data` with one extra column named after the input column.
calculate_percentile <- function(data, colname) {
  data %>%
    # Scale the cumulative distribution (0-1) to a percentage; the factor 100
    # must be applied to the result of cume_dist(), not to its input.
    mutate("{{colname}}Percentile" := cume_dist({{colname}}) * 100)
}
tbl %>%
  calculate_percentile(Test)
# A tibble: 3 x 3
# Name Test TestPercentile
# <chr> <dbl> <dbl>
#1 Alice 16 100
#2 Bob 13 33.3
#3 Cat 15 66.7
Edit for multiple columns
New Data
tbl <- tibble(Name = c("Alice", "Bob", "Cat"), Test = c(16, 13, 15), Test_math = c(16, 30, 55), Test_music = c(3, 78, 34))
# Add a "<col>Percentile" column for every column selected by `colnames`.
#
# data:     a data frame / tibble.
# colnames: column selection forwarded to across().
# Returns `data` with one additional percentile column per selected column.
calculate_percentile <- function(data, colnames) {
  to_percentile <- function(values) cume_dist(values) * 100
  mutate(data, across({{colnames}}, to_percentile, .names = "{col}Percentile"))
}
test_columns <- c("Test_math", "Test_music")
tbl %>%
calculate_percentile(test_columns)
# A tibble: 3 x 6
Name Test Test_math Test_music Test_mathPercentile Test_musicPercentile
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Alice 16 16 3 33.3 33.3
2 Bob 13 30 78 66.7 100
3 Cat 15 55 34 100 66.7
Why does your solution not work? Because your solution applies cume_dist literally to the string "Test" rather than to the column of that name:
tbl %>% mutate({{percname}} := print({{colname}}))
[1] "Test"
# A tibble: 3 x 5
Name Test Test_math Test_music TestPercentile
<chr> <dbl> <dbl> <dbl> <chr>
1 Alice 16 16 3 Test
2 Bob 13 30 78 Test
3 Cat 15 55 34 Test
Why does this give a TestPercentile value of 100? Because cume_dist of a single string (shown here with "test") is 1, and 1 * 100 = 100:
cume_dist("test")
#[1] 1
So we need to tell R not to evaluate the string "Test" per se but to look for a variable with this name, which we can do like this:
# .data[[colname]] looks up the column whose name is stored in `colname`;
# it needs no extra package and also works for non-syntactic column names.
tbl %>% mutate({{percname}} := cume_dist(.data[[colname]]) * 100)
# A tibble: 3 x 5
Name Test Test_math Test_music TestPercentile
<chr> <dbl> <dbl> <dbl> <dbl>
1 Alice 16 16 3 100
2 Bob 13 30 78 33.3
3 Cat 15 55 34 66.7
#Check that this uses the values of "Test" and not "Test" per se:
# print() shows what is actually passed on: the values of the column named in
# `colname`, retrieved with the .data pronoun.
tbl %>% mutate({{percname}} := print(.data[[colname]]))
[1] 16 13 15
# A tibble: 3 x 5
Name Test Test_math Test_music TestPercentile
<chr> <dbl> <dbl> <dbl> <dbl>
1 Alice 16 16 3 16
2 Bob 13 30 78 13
3 Cat 15 55 34 15
Passing column name as string :
library(dplyr)
library(rlang)
# Add a "<colname>Percentile" column, with the column name given as a string.
#
# data:    a data frame / tibble.
# colname: name of the source column, as a character string.
return_percentile <- function(data, colname) {
  source_col <- sym(colname)
  target <- paste0(colname, "Percentile")
  mutate(data, !!target := cume_dist(!!source_col) * 100)
}
tbl %>% return_percentile("Test")
# A tibble: 3 x 3
# Name Test TestPercentile
# <chr> <dbl> <dbl>
#1 Alice 16 100
#2 Bob 13 33.3
#3 Cat 15 66.7
Passing column name unquoted :
# Add a "<colname>Percentile" column, with the column given unquoted.
#
# data:    a data frame / tibble.
# colname: bare column name, e.g. Test.
return_percentile <- function(data, colname) {
  # Recover the text of the argument to build the output column name.
  col_label <- deparse(substitute(colname))
  percname <- paste0(col_label, "Percentile")
  mutate(data, !!percname := cume_dist({{colname}}) * 100)
}
tbl %>% return_percentile(Test)
# A tibble: 3 x 3
# Name Test TestPercentile
# <chr> <dbl> <dbl>
#1 Alice 16 100
#2 Bob 13 33.3
#3 Cat 15 66.7
Consider the following:
df <- data.frame(
Name = c("Alan", "Bob", "Christine", "David", "Erica"),
Gender = c("M", "M", "F", "M", "F"),
Star_Sign = c("Aquarius", "Capricorn", "Aquarius", "Libra", "Leo"),
City = c("London", "Paris", "Berlin", "London", "Paris"),
Blood_Group = c("A", "AB", "B", "O", "A"),
Hours_Worked = c(2000, 1600, 0, 100, 200),
Salary = c(100000, 20000, 0, 500, 4000)
)
Name_Summary <- df %>% group_by(Name) %>% summarise(Hours_Worked = sum(Hours_Worked), Average_Salary = mean(Salary))
Gender_Summary <- df %>% group_by(Gender) %>% summarise(Hours_Worked = sum(Hours_Worked), Average_Salary = mean(Salary))
Star_Sign_Summary <- df %>% group_by(Star_Sign) %>% summarise(Hours_Worked = sum(Hours_Worked), Average_Salary = mean(Salary))
City_Summary <- df %>% group_by(City) %>% summarise(Hours_Worked = sum(Hours_Worked), Average_Salary = mean(Salary))
Blood_Group_Summary <- df %>% group_by(Blood_Group) %>% summarise(Hours_Worked = sum(Hours_Worked), Average_Salary = mean(Salary))
Obviously this works fine for a small number of fields. If, however, I've got 100 different fields (say) to do this for, it becomes very unwieldy.
I'd like to think that there is a way to loop through the list of fields and produce these summaries for each field, using some code to generate (and name the summaries), but I don't think I know how to do this. Can anyone help please?
Thanks
Alan
If you have a list of the columns you want to group by as a character vector:
vars_to_group_by <- names(df)[1:5]
You could iterate over them (I'm using purrr::map() but you could use lapply() or a loop), and use this rlang pattern to convert strings >> symbols >> properly evaluated variables.
library(tidyverse)
# Turn each grouping column name into a symbol, then build one summary table
# per grouping column. The result is a list with one tibble per column.
vars_to_group_by %>%
  map(sym) %>%
  map(function(group_var) {
    df %>%
      group_by(!!group_var) %>%
      summarise(
        avg_salary = mean(Salary),
        avg_hours = mean(Hours_Worked),
        avg_hourly_wage = avg_salary / avg_hours
      )
  })
You get an unnamed list back, because the vector going in was unnamed.
[[1]]
# A tibble: 5 x 4
Name avg_salary avg_hours avg_hourly_wage
<fct> <dbl> <dbl> <dbl>
1 Alan 100000 2000 50
2 Bob 20000 1600 12.5
3 Christine 0 0 NaN
4 David 500 100 5
5 Erica 4000 200 20
[[2]]
# A tibble: 2 x 4
Gender avg_salary avg_hours avg_hourly_wage
<fct> <dbl> <dbl> <dbl>
1 F 2000 100 20
2 M 40167. 1233. 32.6
[[3]]
# A tibble: 4 x 4
Star_Sign avg_salary avg_hours avg_hourly_wage
<fct> <dbl> <dbl> <dbl>
1 Aquarius 50000 1000 50
2 Capricorn 20000 1600 12.5
3 Leo 4000 200 20
4 Libra 500 100 5
[[4]]
# A tibble: 3 x 4
City avg_salary avg_hours avg_hourly_wage
<fct> <dbl> <dbl> <dbl>
1 Berlin 0 0 NaN
2 London 50250 1050 47.9
3 Paris 12000 900 13.3
[[5]]
# A tibble: 4 x 4
Blood_Group avg_salary avg_hours avg_hourly_wage
<fct> <dbl> <dbl> <dbl>
1 A 52000 1100 47.3
2 AB 20000 1600 12.5
3 B 0 0 NaN
4 O 500 100 5
You could add names based on vars_to_group_by either before or after the map() calls.
We could use the group_by_at which can take a string as input
library(purrr)
library(dplyr)
# Summarise once per categorical column. Columns 1-5 are the grouping
# candidates; Hours_Worked and Salary (columns 6-7) are measures and must not
# be used as grouping variables.
map(names(df)[1:5], ~ df %>%
  group_by_at(.x) %>%
  summarise(avg_salary = mean(Salary)))
#[[1]]
# A tibble: 5 x 2
# Name avg_salary
# <fct> <dbl>
#1 Alan 100000
#2 Bob 20000
#3 Christine 0
#4 David 500
#5 Erica 4000
#[[2]]
# A tibble: 2 x 2
# Gender avg_salary
# <fct> <dbl>
#1 F 2000
#2 M 40167.
#[[3]]
# A tibble: 4 x 2
# Star_Sign avg_salary
# <fct> <dbl>
#1 Aquarius 50000
#2 Capricorn 20000
#3 Leo 4000
#4 Libra 500
#[[4]]
# A tibble: 3 x 2
# City avg_salary
# <fct> <dbl>
#1 Berlin 0
#2 London 50250
#3 Paris 12000
#[[5]]
# A tibble: 4 x 2
# Blood_Group avg_salary
# <fct> <dbl>
#1 A 52000
#2 AB 20000
#3 B 0
#4 O 500