Make unique strings in a data frame - r

I'd like to make a column of (possibly) non-unique strings into a column of unique strings.
For instance, consider:
df <- data.frame(
  'Initials' = c("AA", "AB", "AB"),
  'Data' = c(1, 2, 3)
)
df
  Initials Data
1       AA    1
2       AB    2
3       AB    3
I would like to obtain this:
  Initials Data
1       AA    1
2   AB (1)    2
3   AB (2)    3
Note: I know I could use the rownames to uniquely identify the row, but I'd like to retain the string stored in the Initials column, with a number appended.

transform(df, Initials = ave(as.character(Initials), Initials,
                             FUN = function(x) if (length(x) > 1) paste0(x, " (", seq(x), ")") else x))
#   Initials Data
# 1       AA    1
# 2   AB (1)    2
# 3   AB (2)    3
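For context, ave() splits its first argument into groups defined by its second argument and applies FUN within each group, so the renaming function above only ever sees the set of rows sharing the same initials. A minimal sketch of the grouping behaviour it relies on, using the example's own values:

ave(c(1, 2, 3), c("AA", "AB", "AB"), FUN = seq_along)
# [1] 1 1 2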

w <- ave(df$Data, df$Initials, FUN = seq_along)  # within-group counter: 1 1 2
df$Initials <- paste(df$Initials, "(", w, ")", sep = "")
df
#   Initials Data
# 1    AA(1)    1
# 2    AB(1)    2
# 3    AB(2)    3
Note that this variant numbers every value, so the unique "AA" becomes "AA(1)" as well.

TLDR
You can use make_unique from the makeunique package (disclaimer: I'm the author):
# install.packages('makeunique')
library(makeunique)

df <- data.frame(
  Initials = c("AA", "AB", "AB"),
  Data = c(1, 2, 3)
)

df[['Initials']] <- make_unique(df[['Initials']])
df
#>   Initials Data
#> 1       AA    1
#> 2   AB (1)    2
#> 3   AB (2)    3
Created on 2022-10-15 by the reprex package (v2.0.1)
Full Answer
Disclaimer: I'm the author of the makeunique package.
The problem with @Sven Hohenstein's answer is that it can still produce duplicates, for example if your input dataset happens to already include the string 'AB (2)'. See an example of the issue below:
df <- data.frame(
  Initials = c("AA", "AB", "AB", "AB (2)"),
  Data = c(1, 2, 3, 4)
)

transform(df, Initials = ave(as.character(Initials), Initials,
                             FUN = function(x) if (length(x) > 1) paste0(x, " (", seq(x), ")") else x))
#>   Initials Data
#> 1       AA    1
#> 2   AB (1)    2
#> 3   AB (2)    3
#> 4   AB (2)    4
Created on 2022-10-15 by the reprex package (v2.0.1)
Really, you'd want either something like make_unique from the makeunique package, which warns you when this happens and lets you change the suffix formatting to fix the issue:
# install.packages('makeunique')
library(makeunique)

df <- data.frame(
  Initials = c("AA", "AB", "AB", "AB (2)"),
  Data = c(1, 2, 3, 4)
)

df[['Initials']] <- make_unique(df[['Initials']])
#> Error in make_unique(df[["Initials"]]): make_unique failed to make vector unique.
#> This is because appending ' <dup_number>' to duplicate values led to creation of term(s) that were in the original dataset:
#> [AB (2)]
#>
#> Please try again with a different argument for either `wrap_in_brackets` or `sep`

# Change suffix format to fix problem
df[['Initials']] <- make_unique(df[['Initials']], sep = "-")
Created on 2022-10-15 by the reprex package (v2.0.1)
Or you could use make.unique from base R, which ensures uniqueness but doesn't let you control how the suffixes look.
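For reference, base make.unique leaves the first occurrence of each value untouched and appends ".1", ".2", and so on to the later duplicates:

make.unique(c("AA", "AB", "AB"))
# [1] "AA"   "AB"   "AB.1"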
If you don't want to use a package, feel free to just copy the source function for make_unique into your code and use it as your own function:
make_unique <- function(x, sep = " ", wrap_in_brackets = TRUE, warn_about_type_conversion = TRUE) {
  if (!(is.character(sep) & length(sep) == 1)) stop('`sep` must be a string, not a ', paste0(class(sep), collapse = " "))
  if (!(is.logical(wrap_in_brackets) & length(wrap_in_brackets) == 1)) stop('`wrap_in_brackets` must be a flag, not a ', paste0(class(wrap_in_brackets), collapse = " "))
  if (!(is.logical(warn_about_type_conversion) & length(warn_about_type_conversion) == 1)) stop('`warn_about_type_conversion` must be a flag, not a ', paste0(class(warn_about_type_conversion), collapse = " "))
  if (!any(is.numeric(x), is.character(x), is.factor(x))) stop('input to `make_unique` must be a character, numeric, or factor variable')

  if (is.factor(x)) {
    if (warn_about_type_conversion) warning('make_unique: Converting factor to character variable')
    x <- as.character(x)
  }
  else if (is.numeric(x)) {
    if (warn_about_type_conversion) warning('make_unique: Converting numeric variable to a character vector')
    x <- as.character(x)
  }

  # Within each group of identical values, append a sequence-number suffix
  deduplicated <- stats::ave(x, x, FUN = function(a) {
    if (length(a) > 1) {
      suffixes <- seq_along(a)
      if (wrap_in_brackets) suffixes <- paste0('(', suffixes, ')')
      paste0(a, sep, suffixes)
    }
    else {a}
  })

  # Fail loudly if adding suffixes collided with values already in the input
  values_still_duplicated <- deduplicated[duplicated(deduplicated)]
  if (length(stats::na.omit(values_still_duplicated)) > 0) {
    stop(
      "make_unique failed to make vector unique.\n",
      "This is because appending ' <dup_number>' to duplicate values led to ",
      "creation of term(s) that were in the original dataset: \n[",
      paste0(values_still_duplicated, collapse = ', '),
      "]\n\nPlease try again with a different argument for either `wrap_in_brackets` or `sep`"
    )
  }
  return(deduplicated)
}
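A quick usage sketch of the copied function, reproducing the original example:

make_unique(c("AA", "AB", "AB"))
# [1] "AA"     "AB (1)" "AB (2)"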

Related

How to insert a part of a value of a column into a new column

I haven't been programming for long, and I've now run into a problem I haven't been able to solve.
In my dataframe there is a column that contains several pieces of information. For example, one row looks like this:
sp|O94910|AGRL1_HUMAN
or like this
sp|Q13554|KCC2B_HUMAN;sp|Q13555|KCC2G_HUMAN
Now I want to create a new column containing the code between the two vertical bars.
For the first example it would be O94910; for the second, Q13554; Q13555.
I have already tried functions like str_extract_all, str_match or gsub. But nothing worked.
The "id" is the column I look at. It includes different combinations of digits. I need the one between the two |
> dput(head(anaDiff_PD_vs_CTRL$id, 10))
c("sp|O94910|AGRL1_HUMAN", "sp|P02763|A1AG1_HUMAN", "sp|P19652|A1AG2_HUMAN",
  "sp|P25311|ZA2G_HUMAN", "sp|Q8NFZ8|CADM4_HUMAN", "sp|P08174|DAF_HUMAN",
  "sp|Q15262|PTPRK_HUMAN", "sp|P78324|SHPS1_HUMAN;sp|Q5TFQ8|SIRBL_HUMAN;sp|Q9P1W8|SIRPG_HUMAN",
  "sp|Q8N3J6|CADM2_HUMAN", "sp|P19021|AMD_HUMAN")
With dplyr and stringr you can try...
library(dplyr)
library(stringr)

dat %>%
  rowwise() %>%
  mutate(dig = str_extract_all(col, "(?<=sp\\|)[A-Z0-9]+(?=\\|)"),
         dig = paste0(dig, collapse = "; "))
#> # A tibble: 4 x 2
#> # Rowwise:
#>   col                                         dig
#>   <chr>                                       <chr>
#> 1 sp|Q8NFZ8|CADM4_HUMAN                       Q8NFZ8
#> 2 sp|94910|AGRL1_HUMAN                        94910
#> 3 sp|O94910|AGRL1_HUMAN                       O94910
#> 4 sp|Q13554|KCC2B_HUMAN;sp|Q13555|KCC2G_HUMAN Q13554; Q13555
data
dat <- data.frame(col = c("sp|Q8NFZ8|CADM4_HUMAN", "sp|94910|AGRL1_HUMAN", "sp|O94910|AGRL1_HUMAN", "sp|Q13554|KCC2B_HUMAN;sp|Q13555|KCC2G_HUMAN"))
Created on 2022-02-02 by the reprex package (v2.0.1)
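The pattern relies on lookarounds: (?<=sp\|) requires a preceding "sp|" and (?=\|) requires a following "|", so only the code between the two bars is matched. If you prefer to avoid dplyr/stringr here, a base R sketch of the same extraction (assuming the same dat as above) would be:

ids <- regmatches(dat$col, gregexpr("(?<=sp\\|)[A-Z0-9]+(?=\\|)", dat$col, perl = TRUE))
dat$dig <- vapply(ids, paste, character(1), collapse = "; ")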
Here is a solution without tidyverse:
dat <- read.table(text = "
sp|Q8NFZ8|CADM4_HUMAN
sp|94910|AGRL1_HUMAN
sp|O94910|AGRL1_HUMAN
sp|Q13554|KCC2B_HUMAN;sp|Q13555|KCC2G_HUMAN")
ids <- strsplit(dat$V1, ";")
ids <- lapply(ids, function(x) gsub("sp\\|([[:alnum:]]*)\\|.*", "\\1", x))
ids <- lapply(ids, function(x) paste(x, collapse="; "))
dat$newcol <- unlist(ids)
Even with tidyverse, I would define a helper function for more clarity:
library(purrr)

extract_ids <- function(x) {
  ids <- strsplit(x, ";")
  ids <- map(ids, ~ gsub("sp\\|([[:alnum:]]*)\\|.*", "\\1", .))
  ids <- map(ids, ~ paste(., collapse = "; "))
  unlist(ids)
}
dat <- dat %>% mutate(ids = extract_ids(V1))
This solution should help if you want to change your column names in a similar fashion:
library(tidyverse)

# create test data frame with column names "sp|O94910|AGRL1_HUMAN" and "sp|Q13554|KCC2B_HUMAN;sp|Q13555|KCC2G_HUMAN"
col1 <- c(1, 2, 3, 4, 5)
col2 <- c(6, 7, 8, 9, 10)
df <- data.frame(col1, col2)
names(df)[1] <- "sp|O94910|AGRL1_HUMAN"
names(df)[2] <- "sp|Q13554|KCC2B_HUMAN;sp|Q13555|KCC2G_HUMAN"

# split the strings representing the column names, separated by "|", into a list
names <- as.data.frame(str_split(colnames(df), "\\|", simplify = TRUE))

# remove all strings that contain fewer digits than letters or special characters
for (i in 1:nrow(names)) {
  for (j in 1:ncol(names)) {
    if (str_count(as.vector(str_split(names[i, j], "\\|", simplify = TRUE)), "[0-9]") >
        str_count(as.vector(str_split(names[i, j], "\\|", simplify = TRUE)), "[:alpha:]|[:punct:]")) {
      names[i, j] <- names[i, j]
    } else {
      names[i, j] <- ""
    }
  }
}

# combine the list columns into a single column called "colnames"
names <- names %>% unite("colnames", 1:5, na.rm = TRUE, remove = TRUE, sep = ";")

# remove ";" separators at the start and end of the strings, and collapse runs of ";" into a single ";"
for (i in 1:nrow(names)) {
  names[i, ] <- str_replace(names[i, ], "\\;+$", "") %>%
    str_replace("^\\;+", "") %>%
    str_replace("\\;{2}", ";")
}

# convert the column with the new names into a vector
new_names <- as.vector(names$colnames)
# replace old names with new names
names(df) <- new_names
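A more compact sketch of the same renaming, reusing the lookaround pattern from the first answer (this assumes stringr and purrr are loaded, and that every name contains at least one code between bars):

names(df) <- map_chr(str_extract_all(names(df), "(?<=sp\\|)[A-Z0-9]+(?=\\|)"),
                     paste, collapse = ";")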

Is there an R function that reads text files with \n as a (column) delimiter?

The Problem
I'm trying to come up with a neat/fast way to read files delimited by newline (\n) characters into more than one column.
Essentially, multiple rows in the input file should become a single row in the output; however, most file-reading functions sensibly interpret the newline character as signifying a new row, and so the result ends up as a data frame with a single column. Here's an example:
The input files look like this:
Header Info
2021-01-01
text
...
#
2021-01-02
text
...
#
...
Where the ... represents potentially multiple rows in the input file, and the # signifies what should really be the end of a row in the output data frame. So upon reading this file, it should become a data frame like this (ignoring the header):
X1          X2    ...  Xn
2021-01-01  text  ...  ...
2021-01-02  text  ...  ...
...         ...   ...  ...
My attempt
I've tried base, data.table, readr and vroom, and they all produce one of two outputs: either a data frame with a single column, or a vector. I want to avoid a for loop, so my current solution uses base::readLines() to read the file as a character vector, then manually adds some "proper" column separators (e.g. ;), and then joins and splits again.
# Save the example data to use as input
writeLines(c("Header Info", "2021-01-01", "text", "#", "2021-01-02", "text", "#"), "input.txt")
input <- readLines("input.txt")
input <- paste(input[2:length(input)], collapse = ";") # Skip the header
input <- gsub(";#;*", replacement = "\n", x = input)
input <- strsplit(unlist(strsplit(input, "\n")), ";")
input <- do.call(rbind.data.frame, input)
# Clean up the example input
unlink("input.txt")
My code above works and gives the desired result, but surely there's a better way??
Edit: This is internal in a function, so part (perhaps the larger part) of the intention of any simplification is to improve the speed.
Thanks in advance!
1) Read in the data, locate the # signs (giving the logical variable at), and then create a grouping variable g which has a distinct value for each desired output line. Finally, use tapply with paste to rework the data into lines that can be read using read.table, and read them. (If there are commas in the data then use some other separating character.)
L <- readLines("input.txt")[-1]
at <- grepl("#", L)
g <- cumsum(at)
read.table(text = tapply(L[!at], g[!at], paste, collapse = ","),
           sep = ",")
giving this data frame:
          V1   V2
1 2021-01-01 text
2 2021-01-02 text
2) Another approach is to rework the data into dcf form by removing the # sign and prefacing other lines with their column name and a colon. Then use read.dcf. cnames is a character vector of column names that you want to use.
cnames <- c("Date", "Text")
L <- readLines("input.txt")[-1]
LL <- sub("#", "", paste0(c(paste0(cnames, ": "), ""), L))
DF <- as.data.frame(read.dcf(textConnection(LL)))
DF[] <- lapply(DF, type.convert, as.is = TRUE)
DF
giving this data frame:
        Date Text
1 2021-01-01 text
2 2021-01-02 text
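To see why read.dcf works here, this is what LL looks like for the example input: each record becomes "field: value" lines, and the blank line left by removing the # terminates the record:

LL
## [1] "Date: 2021-01-01" "Text: text"       ""
## [4] "Date: 2021-01-02" "Text: text"       ""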
3) This approach simply reshapes the data into a matrix and then converts it to a data frame. Note that (1) converts numeric columns to numeric whereas this one just leaves them as character.
L <- readLines("input.txt")[-1]
k <- grep("#", L)[1]
as.data.frame(matrix(L, ncol = k, byrow = TRUE))[, -k]
##           V1   V2
## 1 2021-01-01 text
## 2 2021-01-02 text
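The intermediate matrix for the example makes the reshape explicit; the # column (column k, here 3) is then dropped:

matrix(L, ncol = k, byrow = TRUE)
##      [,1]         [,2]   [,3]
## [1,] "2021-01-01" "text" "#"
## [2,] "2021-01-02" "text" "#"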
Benchmark
The question did not mention speed as a consideration but in a comment it was later mentioned. Based on the data in the benchmark below (1) runs over twice as fast as the code in the question and (3) runs nearly 25x faster.
library(microbenchmark)

writeLines(c("Header Info",
             rep(c("2021-01-01", "text", "#", "2021-01-02", "text", "#"), 10000)),
           "input.txt")
microbenchmark(times = 10,
  ques = {
    input <- readLines("input.txt")
    input <- paste(input[2:length(input)], collapse = ";") # Skip the header
    input <- gsub(";#;*", replacement = "\n", x = input)
    input <- strsplit(unlist(strsplit(input, "\n")), ";")
    input <- do.call(rbind.data.frame, input)
  },
  ans1 = {
    L <- readLines("input.txt")[-1]
    at <- grepl("#", L)
    g <- cumsum(at)
    read.table(text = tapply(L[!at], g[!at], paste, collapse = ","), sep = ",")
  },
  ans3 = {
    L <- readLines("input.txt")[-1]
    k <- grep("#", L)[1]
    as.data.frame(matrix(L, ncol = k, byrow = TRUE))[, -k]
  })
## Unit: milliseconds
##  expr     min      lq    mean  median      uq     max neval cld
##  ques 1146.62 1179.65 1188.74 1194.78 1200.11 1219.01    10   c
##  ans1  518.95  522.75  548.33  532.59  561.55  647.14    10  b
##  ans3   50.47   51.19   51.68   51.69   52.25   52.52    10 a
You can get round some of the string manipulation with something along the lines of:
input <- readLines("input.txt")[-1]  # Read in and remove header
ncol <- which(input == "#")[1] - 1   # Number of columns of data
data.frame(matrix(input[input != "#"], ncol = ncol, byrow = TRUE))  # Convert to data frame
#           X1   X2
# 1 2021-01-01 text
# 2 2021-01-02 text
At this point, you might consider going the full mile and using a proper grammar to parse it. I don't know how big or complex the situation really is, but using pegr it might look something like this:
input <-
"Header Info
2021-01-01
text
multiple lines
of
text
#
2021-01-02
text
more
lines of text
#
"
library(pegr)
library(magrittr)    # for %>%
library(data.table)  # for setnames

peg <- new.parser(commonRules, action = TRUE) +
  c("HEADER <- 'Header Info' EOL", "{}"  ) + # Match literal 'Header Info' and a \n, then discard
  c("TYPE <- 'text' EOL",          "{-}" ) + # Match literal 'text', then store as $TYPE
  c("DATE <- (!EOL .)* EOL",       "{-}" ) + # Match any characters up to a newline. Could improve to look for a date format
  c("EOS <- '#' EOL",              "{}"  ) + # Match end of section, then discard
  c("BODY <- (!EOS .)*",           "{-}" ) + # Match body of text, including newlines
  c("SECTION <- DATE TYPE BODY EOS"      ) + # Combine rules to match each section
  c("DOCUMENT <- HEADER SECTION*"        )   # Combine more rules to match the entire document

res <- peg[["DOCUMENT"]](input)

final <- matrix(value(res), ncol = 3, byrow = TRUE) %>%
  as.data.frame %>%
  setnames(names(value(res))[1:3])
final
Produces:
        DATE TYPE                       BODY
1 2021-01-01 text multiple lines\nof\ntext\n
2 2021-01-02 text      more\nlines of text\n
It might feel clunky if you don't know the syntax, but once you do, it's a fire-and-forget solution. It'll run according to spec until the spec doesn't hold. You don't have to worry about fragile pretreatment, and it is easy to adapt to changing formats in the future.
There's also the tidyverse way:
library(tidyr)
library(readr)
library(stringr)

max_columns <- 5

d <- {
  readr::read_file("file.txt") %>%
    stringr::str_remove("^Header Info\n") %>%
    tibble::enframe(name = NULL) %>%
    separate_rows(value, sep = "#\n") %>%
    separate("value", into = paste0("X", 1:max_columns), sep = "\n")
}
Using your example input in a file called file.txt, the d looks like:
# A tibble: 3 x 5
  X1         X2    X3    X4    X5
  <chr>      <chr> <chr> <chr> <chr>
1 2021-01-01 text  ...   ""    NA
2 2021-01-02 text  ...   ""    NA
3 ...        NA    NA    NA    NA
Warning message:
Expected 5 pieces. Missing pieces filled with `NA` in 3 rows [1, 2, 3].
Note that the warning is simply to make sure you know you're getting NAs; this is inevitable if the number of rows between # markers varies.
I am using data similar to that provided by Sirius for demonstration. You can also do something like this to get a variable number of columns in the resulting data frame:
example <- "Header Info
2021-01-01
text
multiple lines
of
text
#
2021-01-02
text
more
lines of text
#"
library(tidyverse)

example %>% as.data.frame() %>% setNames('dummy') %>%
  separate_rows(dummy, sep = '\\n') %>%
  filter(row_number() != 1) %>%
  group_by(rowid = rev(cumsum(rev(dummy == '#')))) %>%
  filter(dummy != '#') %>%
  mutate(name = paste0('X', row_number())) %>%
  pivot_wider(id_cols = rowid, names_from = name, values_from = dummy)
#> # A tibble: 2 x 6
#> # Groups:   rowid [2]
#>   rowid X1         X2    X3             X4            X5
#>   <int> <chr>      <chr> <chr>          <chr>         <chr>
#> 1     2 2021-01-01 text  multiple lines of            text
#> 2     1 2021-01-02 text  more           lines of text <NA>
Created on 2021-05-30 by the reprex package (v2.0.0)
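The rev(cumsum(rev(...))) step is worth unpacking: counting # markers from the end of the vector gives every line up to and including its own # the same group id. A small sketch with the example's lines:

x <- c("2021-01-01", "text", "#", "2021-01-02", "text", "#")
rev(cumsum(rev(x == "#")))
#> [1] 2 2 2 1 1 1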

Function: splitting strings, duplicating rows and replacing the original string with the split ones

I'm trying to write a function for the first time. It's supposed to split a string into several pieces and return each piece in its own tibble row.
For example, let's say I have that kind of data.
nasty_entry <- tibble(ID = 1:3, Var = c("ABC", "AB", "A"))
I would like to get that.
nice_entry <- tibble(ID = c(1, 1, 1, 2, 2, 3), var = c("A", "B", "C", "A", "B", "A"))
So, I tried to code a function using different kinds of loops (for practice), because my original data has about 300 entries.
nice_entry <- function(data, var, pattern)

  #--------------------DECLARATION--------------------#
  # data    : The tibble containing the data to split.
  # var     : The variable containing the data to split.
  # pattern : The pattern to use for the splitting.

  if (!require(tidyverse)) {install.packages("tidyverse")}
  library(tidyverse)
  if (!require(magrittr)) {install.packages("tidyverse")}
  library(magrittr)

  c1 <- 0              # Reset counter #1
  c2 <- 0              # Reset counter #2
  unchanged_rows <- 0  # The number of rows that have been unchanged.
  changed_rows <- 0    # The number of rows that have been changed.
  new_data <- tibble() # The tibble where the data will be stored.

  repeat{
    c1 <- c1 + 1 # Increase counter #1 by one at each loop.
    c2 <- 0      # Reset counter #2 at each loop.

    # Split the string into several strings.
    splited_str <- str_split(string = data %>% select({{ var }}) %>% slice(c1),
                             pattern = pattern) %>%
      unlist()

    # Add the row to "new_data" if the original string hasn't been split.
    if (length(splited_str) <= 1) {
      unchanged_rows <- unchanged_rows + 1
      new_data <- new_data %>%
        bind_rows(slice(data, c1))
      next
    }

    # Duplicate the row of the original string, once for each piece
    # the original string has been split into.
    if (length(splited_str) > 1) {
      changed_rows <- changed_rows + 1
      duplicated_rows <- data %>%
        slice(rep(c1, each = length(splited_str)))

      # Replace each original string with the new split strings.
      while (c2 < length(splited_str)) {
        c2 <- c2 + 1
        duplicated_rows <- duplicated_rows %>%
          mutate({{ var }} = replace(x = {{ var }}, list = c2, values = splited_str[c2]))
        new_data <- new_data %>%
          bind_rows(slice(duplicated_rows, c2))
      }
    }

    # Break the loop once the entire tibble has been analysed and return "new_data".
    if (c1 == length(nrow(data))) {
      break
      return(new_data)
    }
  }
}
I tried the same code using "real variables" inside the loops, and it seems to work. The problem comes when I wrap everything into the function. I get these errors:
Error: object 'c1' not found
}
Error: unexpected '}' in " }"
}
Error: unexpected '}' in "}"
What am I doing wrong? Maybe it's an indexing problem?
I would also like some advice on writing functions, and on whether there are alternative ways to do the same thing.
Thank you very much!
Mathieu
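The errors above are characteristic of a function body that is not wrapped in braces: only the first complete expression after function(data, var, pattern) becomes the body, and everything after it is evaluated at the top level, where c1 does not exist yet and the extra } tokens are unexpected. A minimal sketch of that failure mode (f, x and y are hypothetical names):

f <- function(x)
  y <- x + 1  # without braces, this line alone is the body of f()
y             # evaluated at top level: Error: object 'y' not found

Wrapping the whole body in { } is the first fix; note also that assigning to an embraced variable inside mutate() needs the := operator rather than =.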
Here is another approach you may want to try:
library(tidyverse)

nasty_entry2 <- nasty_entry %>%
  mutate(Var = strsplit(as.character(Var), "")) %>%
  tidyr::unnest(Var)
# A tibble: 6 x 2
#      ID Var
#   <int> <chr>
# 1     1 A
# 2     1 B
# 3     1 C
# 4     2 A
# 5     2 B
# 6     3 A
We can use separate_rows and specify a regex lookaround to match between two characters. The . in regex matches any character, so this splits between every pair of adjacent characters.
library(dplyr)
library(tidyr)

nasty_entry %>%
  separate_rows(Var, sep = "(?<=.)(?=.)")
# A tibble: 6 x 2
#      ID Var
#   <int> <chr>
# 1     1 A
# 2     1 B
# 3     1 C
# 4     2 A
# 5     2 B
# 6     3 A

Selecting multiple columns using Regular Expressions

I have variables with names such as r1a, r3c, r5e, r7g, r9i, r11k, r13g, r15i, etc. I am trying to select the variables whose names start with r5 through r12 and create a data frame in R.
The best code that I could write to get this done is,
data %>% select(grep("r[5-9][^0-9]", names(data), value = TRUE),
                grep("r1[0-2]", names(data), value = TRUE))
Given that my experience with regular expressions spans a day, I was wondering if anyone could help me write better, more compact code for this!
Here's a regex that gets all the columns at once:
data %>% select(grep("r([5-9]|1[0-2])", names(data), value = TRUE))
The vertical bar represents an 'or'.
As the comments have pointed out, this will fail for items such as r51, and can also be shortened. Instead, you will need a slightly longer regex:
data %>% select(matches("r([5-9]|1[0-2])([^0-9]|$)"))
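A quick check of both patterns against some sample names (r51x is a hypothetical name added to show the edge case):

nms <- c("r1a", "r3c", "r5e", "r7g", "r9i", "r11k", "r13g", "r15i", "r51x")
grep("r([5-9]|1[0-2])", nms, value = TRUE)
#> [1] "r5e"  "r7g"  "r9i"  "r11k" "r51x"
grep("r([5-9]|1[0-2])([^0-9]|$)", nms, value = TRUE)
#> [1] "r5e"  "r7g"  "r9i"  "r11k"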
Suppose that in the code below x represents your names(data). Then the following will do what you want.
# The names of 'data'
x <- scan(what = character(), text = "r1a r3c r5e r7g r9i r11k r13g r15i")
y <- unlist(strsplit(x, "[[:alpha:]]"))
y <- as.numeric(y[sapply(y, `!=`, "")])
x[y > 4]
#[1] "r5e" "r7g" "r9i" "r11k" "r13g" "r15i"
EDIT.
You can make a function with a generalization of the above code. This function has three arguments, the first is the vector of variables names, the second and the third are the limits of the numbers you want to keep.
var_names <- function(x, from = 1, to = Inf) {
  y <- unlist(strsplit(x, "[[:alpha:]]"))
  y <- as.integer(y[sapply(y, `!=`, "")])
  x[from <= y & y <= to]
}
var_names(x, 5)
#[1] "r5e"  "r7g"  "r9i"  "r11k" "r13g" "r15i"
Remove the non-digits, scan the remainder in and check whether each is in 5:12 :
DF <- data.frame(r1a=1, r3c=2, r5e=3, r7g=4, r9i=5, r11k=6, r13g=7, r15i=8) # test data
DF[scan(text = gsub("\\D", "", names(DF)), quiet = TRUE) %in% 5:12]
##   r5e r7g r9i r11k
## 1   3   4   5    6
Using magrittr it could also be written like this:
library(magrittr)
DF %>% .[scan(text = gsub("\\D", "", names(.)), quiet = TRUE) %in% 5:12]
##   r5e r7g r9i r11k
## 1   3   4   5    6

R: Apply a function over only character columns without type coercion

I have a data frame with many columns. The columns differ in their types: some are numeric, some are character, etc. Here's a small example where we just have 3 variables with 2 types:
# Generate data
dat <- data.frame(x = c("1", "2", "3"),
                  y = c(1.0, 2.5, 3.3),
                  z = c(1, 2, 3),
                  stringsAsFactors = FALSE)
I want to replace the value 3 with a space, but only for character columns. Here's my current code:
out <- as.data.frame(lapply(dat, function(x) {
  ifelse(is.character(x),
         gsub("3", " ", x),
         x)
}), stringsAsFactors = FALSE)
The problem is that ifelse() ignores the fact that y and z are numeric, and it also seems to coerce the numeric variables to character anyway.
One idea has been to pull out the character columns, gsub() them, then bind them back to the original data frame. This, however, changes the ordering of the columns. Key to any solution is that I must not have to specify variables by name, only by type.
One can also do this trivially using dplyr:
# Load package
library(dplyr)

# Create data
dat <- data.frame(x = c("1", "2", "3"),
                  y = c(1.0, 2.5, 3.3),
                  z = c(1, 2, 3),
                  stringsAsFactors = FALSE)

# Replace 3's with spaces for character columns
dat <- dat %>% mutate_if(is.character, function(x) gsub(pattern = "3", " ", x))
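mutate_if() still works but is superseded in current dplyr; a sketch of the same replacement using across() (requires dplyr >= 1.0):

dat <- dat %>% mutate(across(where(is.character), ~ gsub("3", " ", .x)))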
I tried your code, and for me it seems like ifelse did not work but separating if and else does. Below is the code that works:
# Generate data
dat <- data.frame(x = c("1", "2", "3"),
                  y = c(1.0, 2.5, 3.3),
                  z = c(1, 2, 3),
                  stringsAsFactors = FALSE)

> lapply(dat, function(x) { if (is.character(x)) gsub("3", " ", x) else x })
$x
[1] "1" "2" " "

$y
[1] 1.0 2.5 3.3

$z
[1] 1 2 3

> as.data.frame(lapply(dat, function(x) { if (is.character(x)) gsub("3", " ", x) else x }))
  x   y z
1 1 1.0 1
2 2 2.5 2
3   3.3 3
It comes down to this line in ?ifelse:
ifelse returns a value with the same shape as test ...
is.character(x) has length one, so the returned value has length 1. You can use if (...) yes else no instead, as @Heikki suggested.
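A one-line demonstration of the shape rule:

ifelse(TRUE, c("a", "b"), c("x", "y"))
#> [1] "a"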
Similar to @user3614648's solution:
library(dplyr)

dat %>%
  mutate_if(is.character, funs(ifelse(. == "3", " ", .)))
  x   y z
1 1 1.0 1
2 2 2.5 2
3   3.3 3
