Conditional replacement of column name in tibble using dplyr - r

I have the following tibble:
df <- structure(list(gene_symbol = c("0610005C13Rik", "0610007P14Rik",
"0610009B22Rik", "0610009L18Rik", "0610009O20Rik", "0610010B08Rik"
), foo.control.cv = c(1.16204038288333, 0.120508045270669, 0.205712615954009,
0.504508040948641, 0.333956330117591, 0.543693011377001), foo.control.mean = c(2.66407458486012,
187.137728870855, 142.111269303428, 16.7278587043453, 69.8602872478098,
4.77769028710622), foo.treated.cv = c(0.905769898934564, 0.186441944401973,
0.158552512842753, 0.551955061149896, 0.15743983656006, 0.290447431974039
), foo.treated.mean = c(2.40658723367692, 180.846795140269, 139.054032348287,
11.8584348984435, 76.8141734599118, 2.24088124240385)), .Names = c("gene_symbol",
"foo.control.cv", "foo.control.mean", "foo.treated.cv", "foo.treated.mean"
), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
6L))
Which looks like this:
# A tibble: 6 × 5
gene_symbol foo.control.cv foo.control.mean foo.treated.cv foo.treated.mean
* <chr> <dbl> <dbl> <dbl> <dbl>
1 0610005C13Rik 1.1620404 2.664075 0.9057699 2.406587
2 0610007P14Rik 0.1205080 187.137729 0.1864419 180.846795
3 0610009B22Rik 0.2057126 142.111269 0.1585525 139.054032
4 0610009L18Rik 0.5045080 16.727859 0.5519551 11.858435
5 0610009O20Rik 0.3339563 69.860287 0.1574398 76.814173
6 0610010B08Rik 0.5436930 4.777690 0.2904474 2.240881
What I want to do is to replace all column names with mean in it into mean_expr. Resulting in
gene_symbol foo.control.cv foo.control.mean_expr foo.treated.cv foo.treated.mean_expr
1 0610005C13Rik 1.1620404 2.664075 0.9057699 2.406587
2 0610007P14Rik 0.1205080 187.137729 0.1864419 180.846795
3 0610009B22Rik 0.2057126 142.111269 0.1585525 139.054032
4 0610009L18Rik 0.5045080 16.727859 0.5519551 11.858435
5 0610009O20Rik 0.3339563 69.860287 0.1574398 76.814173
6 0610010B08Rik 0.5436930 4.777690 0.2904474 2.240881
How can I achieve that?

With current versions of dplyr, you can use rename_at:
library(dplyr)
df %>% rename_at(vars(contains('mean')), funs(sub('mean', 'mean_expr', .)))
#> # A tibble: 6 × 5
#> gene_symbol foo.control.cv foo.control.mean_expr foo.treated.cv
#> * <chr> <dbl> <dbl> <dbl>
#> 1 0610005C13Rik 1.1620404 2.664075 0.9057699
#> 2 0610007P14Rik 0.1205080 187.137729 0.1864419
#> 3 0610009B22Rik 0.2057126 142.111269 0.1585525
#> 4 0610009L18Rik 0.5045080 16.727859 0.5519551
#> 5 0610009O20Rik 0.3339563 69.860287 0.1574398
#> 6 0610010B08Rik 0.5436930 4.777690 0.2904474
#> # ... with 1 more variables: foo.treated.mean_expr <dbl>
Really, you could use rename_all, as well, as names that don't match would be unaffected anyway. Further, you can use a quosure or anything that can be coerced to a function by rlang::as_function for .funs, so you can use purrr-style notation:
df %>% rename_all(~sub('mean', 'mean_expr', .x))
Since a data frame is a list, purrr's set_names can do the same thing:
library(purrr) # or library(tidyverse)
df %>% set_names(~sub('mean', 'mean_expr', .x))
All return the same thing.

Another option is to paste in rename_at (using the devel version of dplyr)
library(dplyr)
df %>%
rename_at(vars(matches('mean')), funs(sprintf('%s_expr', .)))
# A tibble: 6 × 5
# gene_symbol foo.control.cv foo.control.mean_expr foo.treated.cv foo.treated.mean_expr
#* <chr> <dbl> <dbl> <dbl> <dbl>
#1 0610005C13Rik 1.1620404 2.664075 0.9057699 2.406587
#2 0610007P14Rik 0.1205080 187.137729 0.1864419 180.846795
#3 0610009B22Rik 0.2057126 142.111269 0.1585525 139.054032
#4 0610009L18Rik 0.5045080 16.727859 0.5519551 11.858435
#5 0610009O20Rik 0.3339563 69.860287 0.1574398 76.814173
#6 0610010B08Rik 0.5436930 4.777690 0.2904474 2.240881
Or using rename_if
df %>%
rename_if(grepl("mean", names(.)), funs(sprintf("%s_expr", .)))

Here is a non-dplyr base R method:
names(df) <- sub("mean$", "mean_expr", names(df))
# or names(df) <- sub("mean", "mean_expr", names(df)) if the mean doesn't have to be at the
# end of the string
names(df)
#[1] "gene_symbol" "foo.control.cv" "foo.control.mean_expr"
#[4] "foo.treated.cv" "foo.treated.mean_expr"
If you want it to be a part of the pipe, you can make use of setNames function:
df %>% setNames(sub("mean", "mean_expr", names(.))) %>% names(.)
#[1] "gene_symbol" "foo.control.cv" "foo.control.mean_expr"
#[4] "foo.treated.cv" "foo.treated.mean_expr"

Another option is dplyr::select_all():
df %>% select_all(~gsub("mean", "mean_expr", .))

And with the use of magritrr you can have
library(magrittr)
names(df)[df %>% names %>% grep(pattern = "mean")] %<>% paste0("_expr")
df
# A tibble: 6 x 5
gene_symbol foo.control.cv foo.control.mean_expr foo.treated.cv foo.treated.mean_expr
* <chr> <dbl> <dbl> <dbl> <dbl>
1 0610005C13Rik 1.16 2.66 0.906 2.41
2 0610007P14Rik 0.121 187. 0.186 181.
3 0610009B22Rik 0.206 142. 0.159 139.
4 0610009L18Rik 0.505 16.7 0.552 11.9
5 0610009O20Rik 0.334 69.9 0.157 76.8
6 0610010B08Rik 0.544 4.78 0.290 2.24

Related

In R , there are `actual` and `budget` values,how to add new variable and calculate the variable values

In variable type ,there are actual and budget values,how to add new variable and calculate the variable value ? Current code can work, but a little bording. Anyone can help? Thanks!
ori_data <- data.frame(
category=c("A","A","A","B","B","B"),
year=c(2021,2022,2022,2021,2022,2022),
type=c("actual","actual","budget","actual","actual","budget"),
sales=c(100,120,130,70,80,90),
profit=c(3.7,5.52,5.33,2.73,3.92,3.69)
)
Add sales inc%
ori_data$sales_inc_or_budget_acheved[category=='A'&year=='2022'&type=='actual'] <-
ori_data$sales[category=='A'&year=='2022'&type=='actual']/
ori_data$sales[category=='A'&year=='2021'&type=='actual']-1
Add budget acheved%
ori_data$sales_inc_or_budget_acheved[category=='A'&year=='2022'&type=='budget'] <-
ori_data$sales[category=='A'&year=='2022'&type=='actual']/
ori_data$sales[category=='A'&year=='2022'&type=='budget']
Using a group_by and an if_elseyou could do:
library(dplyr)
ori_data |>
group_by(category) |>
arrange(category, type, year) |>
mutate(sales_inc_or_budget_achieved = if_else(type == "actual",
sales / lag(sales) - 1,
lag(sales) / sales)) |>
ungroup()
#> # A tibble: 6 × 6
#> category year type sales profit sales_inc_or_budget_achieved
#> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 A 2021 actual 100 3.7 NA
#> 2 A 2022 actual 120 5.52 0.2
#> 3 A 2022 budget 130 5.33 0.923
#> 4 B 2021 actual 70 2.73 NA
#> 5 B 2022 actual 80 3.92 0.143
#> 6 B 2022 budget 90 3.69 0.889
And using across you could do the same for both sales and profit:
ori_data |>
group_by(category) |>
arrange(category, type, year) |>
mutate(across(c(sales, profit), ~ if_else(type == "actual",
.x / lag(.x) - 1,
lag(.x) / .x),
.names = "{.col}_inc_or_budget_achieved")) |>
ungroup()
#> # A tibble: 6 × 7
#> category year type sales profit sales_inc_or_budget_achie… profit_inc_or_b…
#> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 A 2021 actual 100 3.7 NA NA
#> 2 A 2022 actual 120 5.52 0.2 0.492
#> 3 A 2022 budget 130 5.33 0.923 1.04
#> 4 B 2021 actual 70 2.73 NA NA
#> 5 B 2022 actual 80 3.92 0.143 0.436
#> 6 B 2022 budget 90 3.69 0.889 1.06
Answer from stefan suits perfectly well, however, I would suggest you rearrange your data first.
In my opinion sales and profit are types of measures (aka observations) and actual and budget are the measurements here:
library(tidyr)
library(dplyr)
ori_data2 <-
ori_data %>%
pivot_longer(c(sales, profit)) %>%
pivot_wider(names_from = type, values_from = value) %>%
group_by(category, name) %>%
arrange(year, .by_group = TRUE)
then your calculations become much more easier:
ori_data2 %>%
mutate(increase = actual / lag(actual) - 1, # compare to the year before
budget_acheved = actual / budget) %>% # compare actual vs. budget
filter(year == 2022) # you can filter for year of interest
mutate(across(c(increase, budget_acheved), scales::percent)) # and format as percent

Is there any function that give the changes between columns?

I have a df that looks like this.
head(dfhigh)
rownames 2015Y 2016Y 2017Y 2018Y 2019Y 2020Y 2021Y
1 Australia 29583.7403 48397.383 45220.323 68461.941 39218.044 20140.351 29773.188
2 Austria* 1294.5092 -8400.973 14926.164 5511.625 2912.795 -14962.963 5855.014
3 Belgium* -24013.3111 68177.596 -3057.153 27119.084 -9208.553 13881.481 22955.298
4 Canada 43852.7732 36061.859 22764.156 37653.521 50141.784 23174.006 59693.992
5 Chile* 20507.8407 12249.294 6128.716 7735.778 12499.238 8385.907 15251.538
6 Czech Republic 465.2137 9814.496 9517.948 11010.423 10108.914 9410.576 5805.084
I want to calculate the changes between years, so instead of the values, the table has the percentage of change (obviously deleting 2015Y).
Try this using (current - previous)/ previous *100
lst <- list()
nm <- names(dfhigh)[-1]
for(i in 1:(length(nm) - 1)){
lst[[i]] <- (dfhigh[[nm[i+1]]] - dfhigh[[nm[i]]]) / dfhigh[[nm[i]]] * 100
}
ans <- do.call(cbind , lst)
colnames(ans) <- paste("ch_of" , nm[-1])
ans
you can change the formula to calculate percentage as you want
You could also use a tidyverse solution.
library(tidyverse)
df %>%
pivot_longer(!rownames) %>%
group_by(rownames) %>%
mutate(value = 100*value/lag(value)-100) %>%
ungroup() %>%
pivot_wider(names_from = name, values_from = value)
# # A tibble: 6 × 8
# rownames `2015Y` `2016Y` `2017Y` `2018Y` `2019Y` `2020Y` `2021Y`
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Australia NA 63.6 -6.56 51.4 -42.7 -48.6 47.8
# 2 Austria* NA -749. -278. -63.1 -47.2 -614. -139.
# 3 Belgium* NA -384. -104. -987. -134. -251. 65.4
# 4 Canada NA -17.8 -36.9 65.4 33.2 -53.8 158.
# 5 Chile* NA -40.3 -50.0 26.2 61.6 -32.9 81.9
# 6 CzechRepublic NA 2010. -3.02 15.7 -8.19 -6.91 -38.3

how can I make a new data frame where the columns are the unique values with corresponding observations from an old data frame? [duplicate]

This question already has answers here:
How to reshape data from long to wide format
(14 answers)
Closed 11 months ago.
My data frame has different dates as rows. Every unique date occurs appr. 500 times. I want to make a new data frame where every column is a unique date and where the rows are all the observations of that date from my old dataset. So for every column dat represents a certain date, I should have appr. 500 rows that each represent a rel_spread from that day.
You can use pivot_wider from tidyr:
library(tidyr)
pivot_wider(df, names_from = date, values_from = rel_spread, values_fn = list) %>%
unnest(everything())
#> # A tibble: 2 x 17
#> `20000103` `20000104` `20000105` `20000106` `20000107` `20000108` `20000109`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.0234 -0.0128 0.00729 0.0408 -0.0298 0.0398 0.0445
#> 2 0.0492 -0.0120 0.0277 0.0435 -0.0288 0.0152 -0.0374
#> # ... with 10 more variables: `20000110` <dbl>, `20000111` <dbl>,
#> # `20000112` <dbl>, `20000113` <dbl>, `20000114` <dbl>, `20000115` <dbl>,
#> # `20000116` <dbl>, `20000117` <dbl>, `20000118` <dbl>, `20000119` <dbl>
Note that we don't have your data (and I wasn't about to transcribe a picture of your data), but I created a little reproducible data set which should match the structure of your data set, except it only has two values per date for demo purposes:
set.seed(1)
df <- data.frame(date = rep(as.character(20000103:20000119), 2),
rel_spread = runif(34, -0.05, 0.05))
df
#> date rel_spread
#> 1 20000103 -0.0234491337
#> 2 20000104 -0.0127876100
#> 3 20000105 0.0072853363
#> 4 20000106 0.0408207790
#> 5 20000107 -0.0298318069
#> 6 20000108 0.0398389685
#> 7 20000109 0.0444675269
#> 8 20000110 0.0160797792
#> 9 20000111 0.0129114044
#> 10 20000112 -0.0438213730
#> 11 20000113 -0.0294025425
#> 12 20000114 -0.0323443247
#> 13 20000115 0.0187022847
#> 14 20000116 -0.0115896282
#> 15 20000117 0.0269841420
#> 16 20000118 -0.0002300758
#> 17 20000119 0.0217618508
#> 18 20000103 0.0491906095
#> 19 20000104 -0.0119964821
#> 20 20000105 0.0277445221
#> 21 20000106 0.0434705231
#> 22 20000107 -0.0287857479
#> 23 20000108 0.0151673766
#> 24 20000109 -0.0374444904
#> 25 20000110 -0.0232779331
#> 26 20000111 -0.0113885907
#> 27 20000112 -0.0486609667
#> 28 20000113 -0.0117612043
#> 29 20000114 0.0369690846
#> 30 20000115 -0.0159651003
#> 31 20000116 -0.0017919885
#> 32 20000117 0.0099565825
#> 33 20000118 -0.0006458693
#> 34 20000119 -0.0313782399
Allan’s answer is perfect if you have the same number of rows for each date. If this isn’t the case, the following should work:
library(tidyr)
library(dplyr)
data_wide <- data_long %>%
group_by(date) %>%
mutate(daterow = row_number()) %>%
ungroup() %>%
pivot_wider(names_from = date, values_from = rel_spread) %>%
select(!daterow)
data_wide
Output:
# A tibble: 6 x 4
`20000103` `20000104` `20000105` `20000106`
<dbl> <dbl> <dbl> <dbl>
1 -0.626 0.184 -0.836 -0.621
2 1.60 0.330 -0.820 -2.21
3 0.487 0.738 0.576 1.12
4 -0.305 1.51 0.390 -0.0449
5 NA NA NA -0.0162
6 NA NA NA 0.944
Example data:
set.seed(1)
data_long <- data.frame(
date = c(rep(20000103:20000105, 4), rep(20000106, 6)),
rel_spread = rnorm(18)
)

reshaping rows of data to two columns

We have data on school districts where the columns are the local-specific information (e.g., free and reduced price lunch %) and the corresponding statewide values.
dat <- tribble(
~state.poverty, ~state.EL, ~state.disability, ~state.frpl, ~local.poverty, ~local.frpl, ~local.disability, ~local.EL,
12.50592, 0.08342419, 0.12321831, 0.4495395, 25.23731, 0.6415712, 0.140739, 0.1469898)
dat
# A tibble: 1 x 8
state.poverty state.EL state.disability state.frpl local.poverty local.frpl local.disability local.EL
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 12.5 0.0834 0.123 0.450 25.2 0.642 0.141 0.147
We want to reshape that so that it looks like this.
demog state local
<chr> <dbl> <dbl>
1 poverty 12.5 25.2
2 EL 0.0834 0.147
3 disability 0.123 0.141
4 frpl 0.450 0.642
It seems like something that pivot_longer should be able to handle, but I haven't had much success so far. Any suggestions?
We can use pivot_longer
library(dplyr)
library(tidyr)
dat %>%
pivot_longer(cols = everything(),
names_to = c(".value", "demog"), names_sep = "\\.")
-output
# A tibble: 4 x 3
# demog state local
# <chr> <dbl> <dbl>
#1 poverty 12.5 25.2
#2 EL 0.0834 0.147
#3 disability 0.123 0.141
#4 frpl 0.450 0.642
A base R option using reshape
reshape(
dat,
direction = "long",
varying = 1:ncol(dat)
)
gives
# A tibble: 4 x 4
time state local id
<chr> <dbl> <dbl> <int>
1 poverty 12.5 25.2 1
2 EL 0.0834 0.642 1
3 disability 0.123 0.141 1
4 frpl 0.450 0.147 1

tidyselect::where() inconsistencies: where is where()?

Summary: You can do rename(A=1, B=2), can you do the same using rename_with()? my ~str_replace(... paste0()) works, I don't need to change that. But it only works for one variable at a time. Tidyselect suggests wrapping where(~str_replace...) but then complains it can't find it even though I can get where() to work in other instances.
I want to implement rename_with for more than one variable, but I get an error Error: Formula shorthand must be wrapped in where()`.
# Bad
data %>% select(~str_replace(., "Var_2_", paste0("Issue: Time")))
# Good
data %>% select(where(~str_replace(., "Var_2_", paste0("Issue: time"))))
Example original:
test%>% rename_with( ~str_replace(., "Var_2_", paste0("Issue: Time")), ~str_replace(., "Var_3_", paste0("Issue: Time")))
when I run
test%>% rename_with(where( ~str_replace(., "Var_2_", paste0("Issue: Time")), ~str_replace(., "Var_3_", paste0("Issue: Time"))))
and
test%>% rename_with( where(~str_replace(., "Var_2_", paste0("Issue: Time"))), where(~str_replace(., "Var_3_", paste0("Issue: Time"))))
I get
Error in where(~str_replace(., "Var_1_", paste0("Gov't surveillance: video wave")), : could not find function "where"
And I can't find it tabbing through tidyselect::
But I can run
test%>% select(where(is.numeric)) %>% map(sd, na.rm = TRUE)
without any issue so it does exist. What am I doing wrong?
Example data:
x <- c("_1_1",
"_1_2",
"_1_3",
"_2_1",
"_2_2",
"_2_3",
"_3_1",
"_3_2",
"_3_3",
"_4_3")
paste0("Var",x)
test <- t(as_tibble(rnorm(10, 5.5, .35)))
colnames(test) <- paste0("Var",x)
There is a switching of arguments in rename_with compared to rename_at. It is a bit unclear about the column names specified in the code and the data showed especially with the str_replace in both arguments. A typical use to replace the column names that starts with 'Var_2' with 'Issue: Time_2' would be
library(dplyr)
data <- data %>%
rename_with(~ str_replace(., 'Var_2', 'Issue: Time'),
starts_with('Var_2'))
-output
data
# A tibble: 1 x 10
# Var_1_1 Var_1_2 Var_1_3 `Issue: Time_1` `Issue: Time_2` `Issue: Time_3` Var_3_1 Var_3_2 Var_3_3 Var_4_3
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 5.68 5.18 5.34 5.38 5.47 5.82 5.93 5.35 5.20 5.62
If we need to change multiple column patterns, use matches
data %>%
rename_with(~ str_replace(., '(Var_2|Var_3)', '\\1_Issue: Time'),
matches('Var_2|Var_3'))
# A tibble: 1 x 10
# Var_1_1 Var_1_2 Var_1_3 `Var_2_Issue: Tim… `Var_2_Issue: Tim… `Var_2_Issue: Tim… `Var_3_Issue: Ti… `Var_3_Issue: Ti… `Var_3_Issue: Ti… Var_4_3
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 5.68 5.18 5.34 5.38 5.47 5.82 5.93 5.35 5.20 5.62
Or if we want to change corresponding replacement, pattern, use str_replace_all
data1 <- data %>%
set_names(str_replace_all(names(.), c("Var_1", "Var_2"), c("Issue 1 wave", "Issue 2 Wave")))
compare the output
data1
# A tibble: 1 x 10
`Issue 1 wave_1` Var_1_2 `Issue 1 wave_3` `Trust Wave_1` Var_2_2 `Issue 2 Wave_3` Var_3_1 Var_3_2 Var_3_3 Var_4_3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 5.68 5.18 5.34 5.38 5.47 5.82 5.93 5.35 5.20 5.62
with original data
data
# A tibble: 1 x 10
Var_1_1 Var_1_2 Var_1_3 Var_2_1 Var_2_2 Var_2_3 Var_3_1 Var_3_2 Var_3_3 Var_4_3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 5.68 5.18 5.34 5.38 5.47 5.82 5.93 5.35 5.20 5.62
where is generally used to check the column value i.e. suppose we want to select columns that are numeric type, use select(where(is.numeric)) and not on the column names. There are select_helpers to find the column names based on a substring i.e. starts_with, ends_with, contains, or pass a regex pattern in matches. An use case of where would be
data %>%
rename_with(~ str_replace(., 'Var_2', 'Issue: Time'), where(~ all(. > 5.5)))
# A tibble: 1 x 10
# Var_1_1 Var_1_2 Var_1_3 Var_2_1 Var_2_2 `Issue: Time_3` Var_3_1 Var_3_2 Var_3_3 Var_4_3
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 5.68 5.18 5.34 5.38 5.47 5.82 5.93 5.35 5.20 5.62
In the OP's code, select/map can be replaced with summarise/across
df %>%
summarise(across(where(is.numeric), sd))
data
data <- as_tibble(test)

Resources