Renaming columns in a dataframe based on a vector - r

I was able to do this, but was wondering if there was a more elegant way, possibly with dplyr rename?
# Example tibble with three columns to rename
tb <- tibble(col1 = 1:3, col2 = 1:3, col3 = 1:3)
#> # A tibble: 3 x 3
#> col1 col2 col3
#> <int> <int> <int>
#> 1 1 1 1
#> 2 2 2 2
#> 3 3 3 3
# Lookup vector: element names are the current column names, values are the
# replacements (col2 is deliberately left out)
new_names <- c(col1 = "Column 1", col3 = "Col3")
#> col1 col3
#> "Column 1" "Col3"
# Look up every current name; names without a replacement come back as NA,
# so coalesce() falls back to the current name for those columns
tb <- setNames(
  object = tb,
  nm = coalesce(new_names[colnames(tb)], colnames(tb))
)
#> # A tibble: 3 x 3
#> `Column 1` col2 Col3
#> <int> <int> <int>
#> 1 1 1 1
#> 2 2 2 2
#> 3 3 3 3

# Load dplyr through pacman
pacman::p_load(dplyr)
# rename() expects `new_name = old_name` pairs, so the lookup vector is named
# by the NEW column names and holds the EXISTING column names as values.
# A named list works just as well as a named character vector.
x <- c("Column 1" = "col1", "Col3" = "col3")
# `!!!` (unquote-splice) unpacks x so that each element reaches rename() as
# its own `new = old` argument
rename(
  tibble(col1 = 1:3, col2 = 1:3, col3 = 1:3),
  !!!x
)
#> # A tibble: 3 x 3
#> `Column 1` col2 Col3
#> <int> <int> <int>
#> 1 1 1 1
#> 2 2 2 2
#> 3 3 3 3
You can find more about it here:
a good book
And here: pretty documentation
The pipe operator adds a small amount of overhead, so you may want to avoid it where it is not needed.

Related

How to use case_when to apply different functions in dplyr

What I am thinking might be naive. But I want to split the rows [1:3] of df based on the second "_", using tidyr::extract()
library(tidyr)
library(dplyr)
extract(col1, into = c("col1", "col2"), "^(.*?_.*?)_(.*)$")
and the rows of df [4:6] based on the first "_"
extract(col1, into = c("col1", "col2"), "^(.*?)_(.*)$")
I am thinking of something like
# Sketch of the idea only (not runnable as written): number the rows, then
# choose a different regex for rows 1-3 and for the remaining rows.
df %>%
mutate(n=row_number())
# NOTE: no %>% connects the mutate() above to the mutate() below, and the
# closing parenthesis of the outer mutate() is missing.
mutate(col2=case_when
# rows 1-3: split at the second "_"
(n<=3 ~ extract(col1, into = c("col1", "col2"), "^(.*?_.*?)_(.*)$"),
# rows after the third: split at the first "_"
n>3 ~ extract(col1, into = c("col1", "col2"), "^(.*?)_(.*)$")
)
Of course, this is screamingly wrong but is it possible in some way?
Example data:
# Example data: six sample identifiers in a single character column
df <- tibble(
  col1 = c(
    "2397_A_run379_CTTGTACT_S119_L004_R1_001",
    "3779_A_run535_TTATAGCC_S91_L003_R1_001",
    "4958_BV_run685_GCGTACGT_S89_L005_R1_001",
    "5126AA_S27_L004_R1_001",
    "5126AF_S32_L004_R1_001",
    "5126AL_S38_L004_R1_001"
  )
)
df
#> # A tibble: 6 × 1
#> col1
#> <chr>
#> 1 2397_A_run379_CTTGTACT_S119_L004_R1_001
#> 2 3779_A_run535_TTATAGCC_S91_L003_R1_001
#> 3 4958_BV_run685_GCGTACGT_S89_L005_R1_001
#> 4 5126AA_S27_L004_R1_001
#> 5 5126AF_S32_L004_R1_001
#> 6 5126AL_S38_L004_R1_001
Created on 2022-11-17 with reprex v2.0.2
If the goal is to extract the substring that starts right after the _ that precedes one or more letters followed by digits,
library(dplyr)
library(stringr)
# Keep everything after the first "_" that is directly followed by letters
# and then digits (e.g. "run379...", "S27...")
mutate(
  df,
  col2 = str_extract(string = col1, pattern = "(?<=_)[A-Za-z]+\\d+.*")
)
-output
# A tibble: 6 × 2
col1 col2
<chr> <chr>
1 2397_A_run379_CTTGTACT_S119_L004_R1_001 run379_CTTGTACT_S119_L004_R1_001
2 3779_A_run535_TTATAGCC_S91_L003_R1_001 run535_TTATAGCC_S91_L003_R1_001
3 4958_BV_run685_GCGTACGT_S89_L005_R1_001 run685_GCGTACGT_S89_L005_R1_001
4 5126AA_S27_L004_R1_001 S27_L004_R1_001
5 5126AF_S32_L004_R1_001 S32_L004_R1_001
6 5126AL_S38_L004_R1_001 S38_L004_R1_001
Or use separate
library(tidyr)
# Split col1 in two at the "_" that follows an uppercase letter and precedes
# letters + digits; extra = "merge" leaves the rest of the string in col2
separate(
  data = df,
  col = col1,
  into = c("col1", "col2"),
  sep = "(?<=[A-Z])_(?=[A-Za-z]+\\d+)",
  extra = "merge"
)
-output
# A tibble: 6 × 2
col1 col2
<chr> <chr>
1 2397_A run379_CTTGTACT_S119_L004_R1_001
2 3779_A run535_TTATAGCC_S91_L003_R1_001
3 4958_BV run685_GCGTACGT_S89_L005_R1_001
4 5126AA S27_L004_R1_001
5 5126AF S32_L004_R1_001
6 5126AL S38_L004_R1_001
tidyr::extract() takes and returns a dataframe, and will be tricky to use inside mutate(). I would instead use something like stringr::str_match():
library(dplyr)
library(stringr)
# Number the rows first, then choose the regex per row range.
# `[, 2]` takes the first capture group from str_match()'s result matrix.
df %>%
  mutate(row = row_number()) %>%
  mutate(
    col2 = case_when(
      # rows 1-3: everything after the second "_"
      row < 4 ~ str_match(col1, ".+?_.+?_(.+)")[, 2],
      # rows 4-6: everything after the first "_"
      row < 7 ~ str_match(col1, ".+?_(.+)")[, 2]
    )
  )
# A tibble: 6 × 3
col1 row col2
<chr> <int> <chr>
1 2397_A_run379_CTTGTACT_S119_L004_R1_001 1 run379_CTTGTACT_S119_L004_R1_001
2 3779_A_run535_TTATAGCC_S91_L003_R1_001 2 run535_TTATAGCC_S91_L003_R1_001
3 4958_BV_run685_GCGTACGT_S89_L005_R1_001 3 run685_GCGTACGT_S89_L005_R1_001
4 5126AA_S27_L004_R1_001 4 S27_L004_R1_001
5 5126AF_S32_L004_R1_001 5 S32_L004_R1_001
6 5126AL_S38_L004_R1_001 6 S38_L004_R1_001

How to rename column as dataframe name in list?

Hello so I was able to rename a list of dataframes using,
names(Final_mixed_list)[1:12] <- sprintf("genome_%d", 1:12)
each dataframe has its own name as genome_1, genome_2 and so on
Now I am trying to rename the second column, "Names", in each data frame of my list to that data frame's own name: "genome_1" instead of "Names" for the genome_1 data frame, "genome_2" for the genome_2 data frame, and so on.
Each of the dataframes have the same column names "COG" and "Names" and the data and number of rows vary but they all have the same two columns
I tried using lapply along with colnames but instead i got my dataframe deleted using the following
# NOTE: the function body ends with an assignment, so each call returns the
# assigned value -- sprintf("genome_%d", 1:12), i.e. all twelve names -- and
# not the modified data frame. The rename happens on a copy inside the
# function, so Final_mixed_list itself is left untouched.
final_mixed_list2 <- lapply(seq_along(Final_mixed_list), function(i) {colnames(Final_mixed_list[[i]])[2] <- sprintf("genome_%d", 1:12)})
and I got a new list where all of my values were deleted and only genome1 to 12 values were present
My expected result is to keep the dataframes intact like before but to change the Names column with their respective dataframe name genome_1 , genome_2 and so on.
I like imap for something like this.
library(tidyverse)
# Example: a named list of one-row tibbles that all share a "Name" column
df_list <- list(
  genome_1 = tibble(Name = "genome_1", val = 9),
  genome_2 = tibble(Name = "genome_2", val = 10),
  genome_3 = tibble(Name = "genome_3", val = 15)
)
df_list
#> $genome_1
#> # A tibble: 1 x 2
#> Name val
#> <chr> <dbl>
#> 1 genome_1 9
#>
#> $genome_2
#> # A tibble: 1 x 2
#> Name val
#> <chr> <dbl>
#> 1 genome_2 10
#>
#> $genome_3
#> # A tibble: 1 x 2
#> Name val
#> <chr> <dbl>
#> 1 genome_3 15
# Solution: imap() passes each tibble together with its list name, and
# "{genome}" := Name renames the Name column to that list name
imap(
  df_list,
  function(tbl, genome) rename(tbl, "{genome}" := Name)
)
#> $genome_1
#> # A tibble: 1 x 2
#> genome_1 val
#> <chr> <dbl>
#> 1 genome_1 9
#>
#> $genome_2
#> # A tibble: 1 x 2
#> genome_2 val
#> <chr> <dbl>
#> 1 genome_2 10
#>
#> $genome_3
#> # A tibble: 1 x 2
#> genome_3 val
#> <chr> <dbl>
#> 1 genome_3 15

How to keep other values unchanged with dplyr's recode_factor

In the example below, recoding some values makes all the other NA. How can I keep the other values unchanged?
library(tibble)
library(dplyr)
# A single factor column with the levels "1", "2" and "3"
test <- tibble(test_vec = factor(c(1, 2, 3)))
test
#> # A tibble: 3 x 1
#> test_vec
#> <fct>
#> 1 1
#> 2 2
#> 3 3
# Recode level "3" using the numeric replacement 4; the levels that are not
# mentioned come back as NA (the behaviour being asked about)
mutate(test, test_vec = recode_factor(test_vec, `3` = 4))
#> # A tibble: 3 x 1
#> test_vec
#> <fct>
#> 1 <NA>
#> 2 <NA>
#> 3 4
You need to make the replacement the same type as the original value.
# Give the replacement as a string so it matches the type of the levels
mutate(
  test,
  test_vec = recode_factor(test_vec, "3" = "4")
)
# A tibble: 3 x 1
test_vec
<fct>
1 1
2 2
3 4
Using fct_recode
library(forcats)
library(dplyr)
# fct_recode() takes `new level = "old level"` pairs
mutate(
  test,
  test_vec = fct_recode(test_vec, "4" = "3")
)
-output
# A tibble: 3 x 1
# test_vec
# <fct>
#1 1
#2 2
#3 4
To avoid getting NA values, you have to list the other values in the function as well.
# Spell out a numeric replacement for every level, not only the one that
# changes
mutate(
  test,
  test_vec = recode_factor(test_vec, `1` = 1, `2` = 2, `3` = 4)
)
Result
# A tibble: 3 x 1
test_vec
<fct>
1 1
2 2
3 4
Another way to do it is using case_when, but for this you have to start from numerical values.
I give you an example starting from numerical values and I convert them to factor.
# Start from a numeric column instead of a factor
test <- tibble(test_vec = c(1, 2, 3))
# Replace 3 by 4, keep every other value, then convert the result to a factor
test %>%
  mutate(
    test_vec = factor(
      case_when(
        test_vec != 3 ~ test_vec,
        test_vec == 3 ~ 4
      )
    )
  )
Result
# A tibble: 3 x 1
test_vec
<fct>
1 1
2 2
3 4

Renaming all dataframe columns with stringr and dplyr

I am trying to rename all columns in my data frame using dplyr and stringr, but it seems not to be working the way I desire. How should I change the following code to get the output I want (shown in the code below)?
Here is the fully reproducible code:
library(dplyr)
library(stringr)
library(tibble)
library(rlang)
# dataframe
# Four integer columns named <person>_<trait>_<outcome>.
# Built with tibble::tibble() directly: tibble::as.tibble() is deprecated
# and the cbind() detour through a matrix is not needed.
x <- tibble::tibble(
  Grace_neu_wrong = 1:4,
  Grace_acc_wrong = 1:4,
  Grace_att_wrong = 1:4,
  Grace_int_wrong = 1:4
)
# defining custom function to rename the entire dataframe in a certain way
# Intended to rename every column of `df`. As written it rewrites the cell
# values instead (see the NOTE below), so the column names come back unchanged.
#
# df:  data frame whose columns should be renamed
# ...: column selection, passed on to dplyr::select()
# Returns `df_new`: the selected columns after mutate_all(splitfn).
string_conversion <- function(df, ...) {
# preparing the dataframe
df <- dplyr::select(.data = df,
!!rlang::quo(...))
# custom function to split the name of each column in a certain way
splitfn <- function(x) {
x <- as.character(x)
# [[1]] keeps only the pieces of the FIRST element of x
split <- stringr::str_split(string = x, pattern = "_")[[1]]
# pieces 2 and 3 are glued together, then "_" and piece 1 are appended;
# a piece that does not exist is pasted as the text "NA"
paste(split[2], split[3], '_', split[1], sep = '')
}
# applying the splitfn function to each column name and outputting the data frame
# NOTE: mutate_all() hands splitfn the column VALUES, not the column names
df_new <- df %>%
dplyr::select_all(.funs = colnames) %>%
dplyr::mutate_all(.funs = splitfn)
return(df_new)
}
# the output I get
string_conversion(df = x, names(x))
#> # A tibble: 4 x 4
#> Grace_neu_wrong Grace_acc_wrong Grace_att_wrong Grace_int_wrong
#> <chr> <chr> <chr> <chr>
#> 1 NANA_1 NANA_1 NANA_1 NANA_1
#> 2 NANA_1 NANA_1 NANA_1 NANA_1
#> 3 NANA_1 NANA_1 NANA_1 NANA_1
#> 4 NANA_1 NANA_1 NANA_1 NANA_1
# the output I desire
# Desired result: same integer data, column names reordered to
# <trait><outcome>_<person>. Built with tibble::tibble() because
# tibble::as.tibble() is deprecated.
tibble::tibble(
  neuwrong_Grace = 1:4,
  accwrong_Grace = 1:4,
  attwrong_Grace = 1:4,
  intwrong_Grace = 1:4
)
#> # A tibble: 4 x 4
#> neuwrong_Grace accwrong_Grace attwrong_Grace intwrong_Grace
#> <int> <int> <int> <int>
#> 1 1 1 1 1
#> 2 2 2 2 2
#> 3 3 3 3 3
#> 4 4 4 4 4
Created on 2018-02-08 by the reprex
package (v0.1.1.9000).
You can do this in a single line without using mutate, which should be for the column values rather than the column names. Instead, do the following using stringr::str_replace and regular expressions.
The pattern "(.*)_(.*)_(.*)" is three groups of characters separated by underscores.
We simply make the replacement "\\2\\3_\\1", which is group 2, then group 3, then an underscore, then group 1, giving us the desired result.
The code is consequently just one line long:
# Reorder the three "_"-separated pieces of every column name:
# "a_b_c" becomes "bc_a"
x <- setNames(
  x,
  str_replace(
    names(x),
    pattern = "(.*)_(.*)_(.*)",
    replacement = "\\2\\3_\\1"
  )
)
print(x)
# A tibble: 4 x 4
neuwrong_Grace accwrong_Grace attwrong_Grace intwrong_Grace
<int> <int> <int> <int>
1 1 1 1 1
2 2 2 2 2
3 3 3 3 3
4 4 4 4 4

Mass changing columns of a data set to numeric

I've imported an excel data set and want to set nearly all columns (greater than 90) to numeric when they are initially characters. What is the best way to achieve this because importing and changing each to numeric one by one isn't the most efficient approach?
This should do as you wish:
# Random data frame for illustration (100 columns wide, 1000 rows).
# `replace = TRUE` is spelled out; `rep = TRUE` relied on partial argument
# matching.
df <- data.frame(replicate(100, sample(0:1, 1000, replace = TRUE)))
# Check column names / positions (just in case you want to check)
colnames(df)
# Columns to convert. seq_along(df) covers every column, also when columns
# are added later, and yields an empty selection for a zero-column data
# frame (1:length(df) would give c(1, 0) there).
cols <- seq_along(df)
# Or, to convert only specific column numbers:
# cols <- c(1:100)
# Convert the selected columns to numeric. %<>% is magrittr's assignment
# pipe: it writes the result back into df[cols]. df[cols] (no comma) stays a
# data frame even when a single column is selected, so lapply() always
# iterates over columns.
library(magrittr)
df[cols] %<>% lapply(function(x) as.numeric(as.character(x)))
# Check that the columns are numeric
str(df)
Assuming your data is already imported with all character columns, you can convert the relevant columns to numeric using mutate_at by position or name:
suppressPackageStartupMessages(library(tidyverse))
# Assume the imported excel file has 5 columns a to e
df <- tibble(
  a = as.character(1:3),
  b = as.character(5:7),
  c = as.character(8:10),
  d = as.character(2:4),
  e = as.character(2:4)
)
# mutate(across(...)) is the current dplyr form of the superseded mutate_at();
# the column selections below are the same ones mutate_at() would take.
# select the columns by position (convert all except 'b')
df %>% mutate(across(c(1, 3:5), as.numeric))
#> # A tibble: 3 x 5
#> a b c d e
#> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 1 5 8 2 2
#> 2 2 6 9 3 3
#> 3 3 7 10 4 4
# or drop the columns that shouldn't be used ('b' and 'd' should stay as chr)
df %>% mutate(across(-c(2, 4), as.numeric))
#> # A tibble: 3 x 5
#> a b c d e
#> <dbl> <chr> <dbl> <chr> <dbl>
#> 1 1 5 8 2 2
#> 2 2 6 9 3 3
#> 3 3 7 10 4 4
# select the columns by name (all_of() marks the vector as column names)
df %>% mutate(across(all_of(c("a", "c", "d", "e")), as.numeric))
#> # A tibble: 3 x 5
#> a b c d e
#> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 1 5 8 2 2
#> 2 2 6 9 3 3
#> 3 3 7 10 4 4
#> # A tibble: 3 x 5
#> a b c d e
#> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 1 5 8 2 2
#> 2 2 6 9 3 3
#> 3 3 7 10 4 4

Resources