Split each string into multiple strings in a vector - r

fruits <- c("apple", "orange", "pear")
df <- data.frame(string = c("appleorange",
"orangepear",
"applepear"))
Desired outcome:
string
appleorange
apple
orange
orangepear
orange
pear
applepear
apple
pear

Here is one approach using regex along with sub:
regex <- paste0("(?:", paste(fruits, collapse="|"), ")")
df$col1 <- sub(paste0(regex, "$"), "", df$string)
df$col2 <- sub(paste0("^", regex), "", df$string)
df
string col1 col2
1 appleorange apple orange
2 orangepear orange pear
3 applepear apple pear
Data:
fruits <- c("apple", "orange", "pear")
df <- data.frame(string = c("appleorange", "orangepear", "applepear"))

Here is a solution using stringr package:
library(dplyr)
library(stringr)
df %>%
mutate(col1 = str_extract(string, paste(fruits, collapse = '|')),
col2 = str_replace(string, col1, ''))
string col1 col2
1 appleorange apple orange
2 orangepear orange pear
3 applepear apple pear

Using separate
library(dplyr)
library(stringr)
library(tidyr)
separate(df, string, into = c("col1", "col2"),
sep = glue::glue("(?<=[a-z])(?={str_c(fruits, collapse='|')})"), remove = FALSE)
string col1 col2
1 appleorange apple orange
2 orangepear orange pear
3 applepear apple pear

Related

How to find rows of data.frame that matches a vector in R

pattern <- c("apple", "banana")
dat <- data.frame(fruit1 = c("melon", "apple", "mango", "apple"),
fruit2 = c("banana", "melon", "papaya", "banana"))
> dat
fruit1 fruit2
1 melon banana
2 apple melon
3 mango papaya
4 apple banana
I want to find out if there's a match between pattern and the rows in dat. In the example above, there is a match in the 4th row of dat.
I tried using match, but that does not seem to work on data.frames. An alternative is to loop over each row of dat:
output <- vector()
for(i in 1:nrow(dat)){
output[i] <- all(dat[i, ] %in% pattern)
}
> which(output)
[1] 4
This is inefficient if there are many rows in dat. Is there a faster way?
You could filter the data like
dat |>
subset(fruit1 == pattern[1] & fruit2 == pattern[2])
# fruit1 fruit2
# 4 apple banana
If you just want the index:
which(colSums(t(dat) == pattern) == 2)
# [1] 4
or shorter
which(!colSums(t(dat) != pattern))
# [1] 4
Approach 1: "manual" approach with indexing
library(dplyr)
dat %>%
filter(fruit1 == pattern[1] & fruit2 == pattern[2])
#> fruit1 fruit2
#> 1 apple banana
Approach 2: create a unique key across both data sources, then match with %in%.
This can be especially useful if you want to retain the "ID" that you matched on for future operations. You can remove it at the end with %>% select(-id) however.
ids <- paste0(pattern, collapse = "")
dat %>%
mutate(id = paste0(fruit1, fruit2)) %>%
filter(id %in% ids)
#> fruit1 fruit2 id
#> 1 apple banana applebanana

Joining and replacing columns multiple times

I have a dataframe with a lot of columns with abbreviations. I'm trying to replace the columns with their full name.
A minimal reproducible example:
category <- data.frame(short = c("TOM", "BAN", "APP", "PEA"),
name = c("tomato", "banana", "apple", "pear"))
df <- data.frame(col1 = c("TOM", "TOM", "TOM", "APP", "TOM"),
col2 = c("APP", "TOM", "TOM", "PEA", "PEA"),
col3 = c("TOM", "PEA", "PEA", "TOM", "BAN"))
col1 col2 col3
1 TOM APP TOM
2 TOM TOM PEA
3 TOM TOM PEA
4 APP PEA TOM
5 TOM PEA BAN
Now, I would like my dataframe to just contain the full names of the products. I can get it to work with left_joins, selecting and renaming, but this code is getting out of hand pretty rapidly with a lot of columns.
df2 <- df %>%
left_join(category, by = c("col1" = "short")) %>%
select(-col1) %>%
rename(col1 = name) %>%
left_join(category, by = c("col2" = "short")) %>%
select(-col2) %>%
rename(col2 = name) %>%
left_join(category, by = c("col3" = "short")) %>%
select(-col3) %>%
rename(col3 = name)
col1 col2 col3
1 tomato apple tomato
2 tomato tomato pear
3 tomato tomato pear
4 apple pear tomato
5 tomato pear banana
I think (hope?) there's a better solution for it, but I'm unable to find it.
An option is to create a named vector
library(dplyr)
library(tibble)
v1 <- deframe(category)
and then use that to match and replace the values
df1 <- df %>%
mutate(across(everything(), ~ v1[.]))
-output
df1
# col1 col2 col3
#1 tomato apple tomato
#2 tomato tomato pear
#3 tomato tomato pear
#4 apple pear tomato
#5 tomato pear banana
It can also be done with recode in a similar way
df %>%
mutate(across(everything(), ~ recode(., !!! v1)))
Or using base R, create the named vector with setNames, loop over the columns with lapply and replace those values and assign it back
v1 <- with(category, setNames(name, short))
df1 <- df
df1[] <- lapply(df, function(x) v1[x])
Or convert to matrix (a matrix is a vector with dim attributes)
# Look up every cell at once through the character matrix; the empty brackets
# in `df1[]` write the results back into all columns while keeping df1 a
# data frame with its original shape and column names.
df1[] <- v1[as.matrix(df)]
Another option is using factor
df[] <- factor(
u <- unlist(df),
labels = with(category, name[match(sort(unique(u)), short)])
)
or a shorter one via setNames
df[]<-with(category,setNames(name,short))[unlist(df)]
which gives
> df
col1 col2 col3
1 tomato apple tomato
2 tomato tomato pear
3 tomato tomato pear
4 apple pear tomato
5 tomato pear banana
You can get the data in long format such that all the values are in one column which is easy to join with category dataframe and then get data back in wide format.
library(dplyr)
library(tidyr)
df %>%
mutate(row = row_number()) %>%
pivot_longer(cols = -row, names_to = 'col', values_to = 'short') %>%
left_join(category, 'short') %>%
select(-short) %>%
pivot_wider(names_from = col, values_from = name) %>%
select(-row)
# col1 col2 col3
# <chr> <chr> <chr>
#1 tomato apple tomato
#2 tomato tomato pear
#3 tomato tomato pear
#4 apple pear tomato
#5 tomato pear banana

Filtering multiple columns of data frame inside a loop in R

I want to use a loop to filter multiple columns of a data frame, removing rows where any of the given column values are in a particular list.
For instance:
> my_df <- data.frame(word1 = c("one", "two", "red", "blue"), word2 = c("apple","orange","banana","pear"), word3 = c("red", "orange", "yellow", "green"))
> color_words = c("red", "orange", "yellow", "green", "blue")
> my_df
word1 word2 word3
1 one apple red
2 two orange orange
3 red banana yellow
4 blue pear green
Using the dplyr filter() function:
> my_df %>% filter(!word1 %in% color_words) %>% filter(!word2 %in% color_words)
word1 word2 word3
1 one apple red
My first attempt to perform this filtering in a loop was:
col_names <- c("word1","word2")
for(col in col_names){
my_df <- my_df %>% filter(!col %in% color_words)
}
> my_df
word1 word2 word3
1 one apple red
2 two orange orange
3 red banana yellow
4 blue pear green
I read about quoting and unquoting when using filter(), so I also tried:
for(col in col_names){
col <- enquo(col)
my_df <- my_df %>% filter(!UQ(col) %in% color_words)
}
> my_df
word1 word2 word3
1 one apple red
2 two orange orange
3 red banana yellow
4 blue pear green
and
for(col in col_names){
my_df <- my_df %>% filter(!UQ(col) %in% color_words)
}
> my_df
word1 word2 word3
1 one apple red
2 two orange orange
3 red banana yellow
4 blue pear green
What is the correct way to go about doing this filtering via a loop?
You don't need a loop; you can use filter with across to apply a function to multiple columns
library(dplyr)
my_df %>% filter(across(all_of(col_names), ~!. %in% color_words))
# word1 word2 word3
#1 one apple red
If you have an older version of dplyr, use filter_at :
my_df %>% filter_at(col_names, all_vars(!. %in% color_words))
Using base R:
my_df <- data.frame(
  word1 = c("one", "two", "red", "blue"),
  word2 = c("apple", "orange", "banana", "pear"),
  word3 = c("red", "orange", "yellow", "green")
)
color_words <- c("red", "orange", "yellow", "green", "blue")
# Test exact set membership column by column and OR the results together.
# A collapsed regex with grepl() would also hit substrings (e.g. "redwood"),
# which is not what the `%in%` filter in the question does.
is_color <- Reduce(`|`, lapply(my_df[1:2], function(x) x %in% color_words))
# Keep only the rows where neither of the first two columns is a colour word.
fltr <- !is_color
my_df[fltr, ]
#> word1 word2 word3
#> 1 one apple red
Created on 2020-09-25 by the reprex package (v0.3.0)

Transforming a list of lists into dataframe

I have a list containing a number of other lists, each of which contain varying numbers of character vectors, with varying numbers of elements. I want to create a dataframe where each list would be represented as a row and each character vector within that list would be a column. Where the character vector has > 1 element, the elements would be concatenated and separated using a "+" sign, so that they can be stored as one string. The data looks like this:
fruits <- list(
list(c("orange"), c("pear")),
list(c("pear", "orange")),
list(c("lemon", "apple"),
c("pear"),
c("grape"),
c("apple"))
)
The expected output is like this:
fruits_df <- data.frame(col1 = c("orange", "pear + orange", "lemon + apple"),
col2 = c("pear", NA, "pear"),
col3 = c(NA, NA, "grape"),
col4 = c(NA, NA, "apple"))
There is no limit on the number of character vectors that can be contained in a list, so the solution needs to dynamically create columns, leading to a df where the number of columns is equal to the length of the list containing the largest number of character vectors.
For every list in fruits you can create a one row dataframe and bind the data.
dplyr::bind_rows(lapply(fruits, function(x) as.data.frame(t(sapply(x,
function(y) paste0(y, collapse = "+"))))))
# V1 V2 V3 V4
#1 orange pear <NA> <NA>
#2 pear+orange <NA> <NA> <NA>
#3 lemon+apple pear grape apple
This is a bit messy but here is one way
cols <- lapply(fruits, function(x) sapply(x, paste, collapse=" + "))
ncols <- max(lengths(cols))
dd <- do.call("rbind.data.frame", lapply(cols, function(x) {length(x) <- ncols; x}))
names(dd) <- paste0("col", 1:ncol(dd))
dd
# col1 col2 col3 col4
# 1 orange pear <NA> <NA>
# 2 pear + orange <NA> <NA> <NA>
# 3 lemon + apple pear grape apple
or another strategy
# Build the result column by column: column i holds the i-th character vector
# of every inner list, collapsed with " + ".
ncols <- max(lengths(fruits))
dd <- data.frame(lapply(seq_len(ncols), function(i) {
  vapply(fruits, function(row) {
    # Lists shorter than i have no i-th vector; return NA explicitly, because
    # pasting an empty selection with `collapse` would give "" instead.
    if (i > length(row)) {
      NA_character_
    } else {
      paste(row[[i]], collapse = " + ")
    }
  }, character(1))
}))
names(dd) <- paste0("col", seq_len(ncols))
dd
But really you need to either build each column or row from your list and then combine them together.
Another approach that melts the list to a data.frame using rrapply::rrapply and then casts it to the required format using data.table::dcast:
library(rrapply)
library(data.table)
## melt to long data.frame
long <- rrapply(fruits, f = paste, how = "melt", collapse = " + ")
## cast to wide data.table
setDT(long)
dcast(long[, .(L1, L2, value = unlist(value))], L1 ~ L2)[, !"L1"]
#> ..1 ..2 ..3 ..4
#> 1: orange pear <NA> <NA>
#> 2: pear + orange <NA> <NA> <NA>
#> 3: lemon + apple pear grape apple

R: Replacing values in a column with corresponding values

How do you replace values in a column when the value fulfils certain conditions in R?
Here I have two data frames.
Fruits <- c("Apple", "Grape Fruits", "Lemon", "Peach", "Banana", "Orange", "Strawberry", "Apple")
df1 <- data.frame(Fruits)
df1
Fruits
Apple
Grape Fruits
Lemon
Peach
Banana
Orange
Strawberry
Apple
Name <- c("Apple", "Orange", "Lemon", "Grape", "Peach","Pinapple")
Rename <- c("Manzana", "Naranja", "Limon", "Uva", "Melocoton", "Anana")
df2 <- data.frame(Name, Rename)
df2
Name Rename
Apple Manzana
Orange Naranja
Lemon Limon
Grape Uva
Peach Melocoton
Pinapple Anana
I want to replace the values in df1$Fruits to corresponding values in df2$Rename, only when each value in df1$Fruits matches that in df2$Name.
So the desired data frame would look like this.
Fruits
Manzana
Grape Fruits
Limon
Melocoton
Banana
Naranja
Strawberry
Manzana
Does anybody know how to do this? Thank you very much for your help.
using plyr
library(plyr)
new.fruits <- mapvalues(Fruits, from = Name, to = Rename)
df <- data.frame(Fruits=new.fruits)
You can use merge and then replace all NA by their respective fruits.
# Left join: all.x = TRUE keeps every row of df1, with NA in Rename where
# df2 has no matching Name.
df3 <- merge(df1, df2, by.x = "Fruits", by.y = "Name", all.x = TRUE)
# Fall back to the original fruit name where no translation was found.
df3$Rename[is.na(df3$Rename)] <- df3$Fruits[is.na(df3$Rename)]
If you need to keep the order:
# Record the original row position first: merge() sorts its result on the
# join key by default, so the input order would otherwise be lost.
df1$id <- seq_len(nrow(df1))
df3 <- merge(df1, df2, by.x = "Fruits", by.y = "Name", all.x = TRUE)
# Fruits without a translation come back as NA; keep their original name.
no_match <- is.na(df3$Rename)
df3$Rename[no_match] <- df3$Fruits[no_match]
# Restore the original order and return only the translated column.
df3 <- df3[order(df3$id), ]
data.frame(Fruits = df3[, "Rename"])
# Fruits
# 1 Manzana
# 2 Grape Fruits
# 3 Limon
# 4 Melocoton
# 5 Banana
# 6 Naranja
# 7 Strawberry
# 8 Manzana
Shorter match solution from @Wen below
df1$new=df2$Rename[match(df1$Fruits,df2$Name)]
df1$new[is.na(df1$new)] <- df1$Fruits[is.na(df1$new)]
Looping over the fruits and matching each one exactly against df2$Name (with ==, not pmatch) can also provide the desired output.
# Translate each fruit through an exact lookup in df2$Name; fruits with no
# entry in df2 keep their original value. Iterating over the Fruits column
# only (instead of whole rows) keeps this working when df1 has extra columns.
df1$Fruits <- vapply(as.character(df1$Fruits), function(fruit) {
  matched <- df2$Name == fruit
  if (any(matched)) {
    # Take the first hit so a duplicated Name still yields a single string.
    as.character(df2$Rename[matched][1L])
  } else {
    fruit
  }
}, character(1), USE.NAMES = FALSE)
df1
# Fruits
# 1 Manzana
# 2 Grape Fruits
# 3 Limon
# 4 Melocoton
# 5 Banana
# 6 Naranja
# 7 Strawberry
# 8 Manzana

Resources