Turning a text column into a vector in r - r

I want to see whether the text column has elements outside the specified values of "a" and "b"
# Values that are allowed to appear in the text column.
specified_value <- c("a", "b")
# Input: one comma-separated string per key.
df <- data.frame(key = c(1, 2, 3, 4), text = c("a,b,c", "a,d", "1,2", "a,b"))
# Desired result: per key, the values that are not in specified_value
# (NA when every value is allowed).
df_out <- data.frame(key = c(1, 2, 3, 4), text = c("c", "d", "1,2", NA))
This is what I have tried:
# Attempt from the question (does not produce the wanted result).
# strsplit() returns a list, so text_vector holds one character vector per row.
df=df%>%mutate(text_vector=strsplit(text, split=","),
# NOTE(review): %in% is applied to the list column as a whole here, not to the
# values inside each row - presumably why no per-row difference is obtained.
extra=text_vector[which(!text_vector %in% specified_value)])
But this doesn't work, any suggestions?

We can split the 'text' by the delimiter , with separate_rows, grouped by 'key', get the elements that are not in 'specified_value' with setdiff and paste them together (toString), then do a join to get the other columns in the original dataset
library(dplyr) # >= 1.0.0
library(tidyr)
# Split `text` into one row per value, collect per key the values that are not
# in specified_value, then join back to recover the original columns.
df %>%
  # Split on commas only; the default separator would also split on any other
  # non-alphanumeric character inside a value.
  separate_rows(text, sep = ",") %>%
  group_by(key) %>%
  summarise(extra = toString(setdiff(text, specified_value))) %>%
  # Name the join column instead of relying on the implicit natural join.
  left_join(df, by = "key") %>%
  # toString() of an empty set is "", which is reported as NA.
  mutate(extra = na_if(extra, ""))
# A tibble: 4 x 3
# key extra text
# <dbl> <chr> <chr>
#1 1 c a,b,c
#2 2 d a,d
#3 3 1, 2 1,2
#4 4 <NA> a,b

Using setdiff.
# For each row, keep the comma-separated values that are not in
# specified_value. Rows with nothing left get a real missing value
# (NA_character_) rather than the string "NA" that paste(NA) would produce.
df$outside <- vapply(
  lapply(strsplit(df$text, ",", fixed = TRUE), setdiff, specified_value),
  function(extra) {
    if (length(extra) > 0) paste(extra, collapse = ",") else NA_character_
  },
  character(1)
)
df
# key text outside
# 1 1 a,b,c c
# 2 2 a,d d
# 3 3 1,2 1,2
# 4 4 a,b <NA>
Data:
# Example input: four keys, each with a comma-separated text value.
df <- data.frame(
  key = c(1, 2, 3, 4),
  text = c("a,b,c", "a,d", "1,2", "a,b"),
  stringsAsFactors = FALSE
)
# Values considered "inside" the specification.
specified_value <- c("a", "b")

use stringi::stri_split_fixed
library(stringi)
# TRUE when the string contains at least one value outside specified_value.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
!all(stri_split_fixed("a,b", ",", simplify = TRUE) %in% specified_value) #FALSE
!all(stri_split_fixed("a,b,c", ",", simplify = TRUE) %in% specified_value) #TRUE

An option using regex without splitting the data on comma :
# Remove every specified value, but only when it is a whole comma-delimited
# token, so that "a" is not stripped out of a longer value such as "ab".
# NOTE: the specified values are used as regex alternatives, so they must not
# contain regex metacharacters.
df$text1 <- gsub(
  paste0("(?<![^,])(", paste0(specified_value, collapse = "|"), ")(?![^,])"),
  "",
  df$text,
  perl = TRUE
)
# Tidy the commas left behind: collapse runs into one comma, then drop any
# leading or trailing comma (the latter was previously left in place, e.g.
# "c,a" became "c,").
df$text1 <- gsub(",+", ",", df$text1)
df$text1 <- gsub("^,|,$", "", df$text1)
df
# key text text1
#1 1 a,b,c c
#2 2 a,d d
#3 3 1,2 1,2
#4 4 a,b

Related

Collapsing Columns in R using tidyverse with mutate, replace, and unite. Writing a function to reuse?

Data:
ID
B
C
1
NA
x
2
x
NA
3
x
x
Results:
ID
Unified
1
C
2
B
3
B_C
I'm trying to combine columns B and C, using mutate and unite, but how would I scale this up so that I can reuse it for multiple columns (think 100+), instead of having to write out the variables each time? Or is there a function that's already built in to do this?
My current solution is this:
library(tidyverse)
# Replace each 'x' marker with the name of its own column, then merge the two
# columns into one, skipping missing values.
Data %>%
  mutate(
    B = replace(B, B == "x", "B"),
    C = replace(C, C == "x", "C")
  ) %>%
  unite(col = "Unified", B:C, na.rm = TRUE, remove = TRUE)
We may use across to loop over the column, replace the value that corresponds to 'x' with column name (cur_column())
library(dplyr)
library(tidyr)
# For every column in B:C, swap the 'x' marker for that column's name
# (cur_column()), then paste the columns together, dropping NAs.
Data %>%
  mutate(
    across(B:C, ~ replace(.x, .x == "x", cur_column()))
  ) %>%
  unite(col = Unified, B:C, na.rm = TRUE, remove = TRUE)
-output
ID Unified
1 1 C
2 2 B
3 3 B_C
data
# Example input: an ID column plus two marker columns (B and C).
Data <- data.frame(
  ID = 1:3,
  B = c(NA, "x", "x"),
  C = c("x", NA, "x"),
  stringsAsFactors = FALSE
)
Here are couple of options.
Using dplyr -
library(dplyr)
# Every column except the first (ID) takes part in the collapse.
cols <- names(Data)[-1]
# Row by row, keep the names of the columns that are not NA and join them
# with "_". The columns are selected through `cols` so the selection and the
# names being pasted always refer to the same set.
Data <- Data %>%
  rowwise() %>%
  mutate(
    Unified = paste0(cols[!is.na(c_across(all_of(cols)))], collapse = "_")
  ) %>%
  ungroup()
Data
# ID B C Unified
# <int> <chr> <chr> <chr>
#1 1 NA x C
#2 2 x NA B
#3 3 x x B_C
Base R
# Build a logical "is present" matrix first, then for each row paste together
# the names of the columns that are present.
Data$Unified <- apply(!is.na(Data[cols]), 1, function(present) {
  paste0(cols[present], collapse = "_")
})

How to move dataframe variable names to first row and add new variable names to multiple dataframes in a list?

library(purrr)
library(tibble)
library(dplyr)
Starting list of dataframes
# Two small data frames with differing, auto-generated-looking column names.
lst <- list(
  df1 = data.frame(
    X.1 = c("1", "2"),
    heading = c("a", "b")
  ),
  df2 = data.frame(
    X.32 = c("3", "4"),
    another.topic = paste("Line ", 1:2)
  )
)
lst
#> $df1
#> X.1 heading
#> 1 1 a
#> 2 2 b
#>
#> $df2
#> X.32 another.topic
#> 1 3 Line 1
#> 2 4 Line 2
Expected "combined" dataframe, with new consistent variable names, and old variable names in the first row of each constituent dataframe.
#> id h1 h2
#> 1 df1 X.1 heading
#> 2 df1 1 a
#> 3 df1 2 b
#> 4 df2 X.32 another.topic
#> 5 df2 3 Line 1
#> 6 df2 4 Line 2
add_row requires "Name-value pairs, passed on to tibble(). Values can be defined only for columns that already exist in .data and unset columns will get an NA value."
Which is what I think I have achieved with this:
# For each data frame, a character vector of its column names that is also
# named by those same column names.
df_nms <- map(lst, ~ set_names(names(.x)))
#> $df1
#> X.1 heading
#> "X.1" "heading"
#>
#> $df2
#> X.32 another.topic
#> "X.32" "another.topic"
But I cannot tie up the last bit, using a purrr function to add the names to the head of each dataframe. I've tried numerous variations with map2 and pmap the closest I can get at present (if I treat add_row as a formula , prefixing it with ~ and remove the .y I get a new first row populated with NAs). I think I'm missing how to pass the name-value pairs to the add_row function.
# Attempt from the question; it stops with the error shown below.
# add_row(.x, .y, .before = 1) is written as a plain call rather than as a
# function or ~ formula, so it is evaluated immediately, where no object
# named .x exists.
map2(lst, df_nms, add_row(.x, .y, .before = 1)) %>%
map(set_names, c("h1", "h2")) %>%
map_dfr(bind_rows, .id = "id")
#> Error in add_row(.x, .y, .before = 1): object '.x' not found
A pointer to resolve this last step would be most appreciated.
Not quite sure how to do this via purrr map functions, but here is an alternative,
library(dplyr)
# For each data frame: turn its column names into a one-row data frame, rename
# the data to match that header row, stack the two, then bind all results with
# the list names in an `id` column.
bind_rows(
  lapply(lst, function(d) {
    header <- as.data.frame(matrix(names(d), ncol = ncol(d)))
    rbind(header, setNames(d, names(header)))
  }),
  .id = "id"
)
# id V1 V2
#1 df1 X.1 heading
#2 df1 1 a
#3 df1 2 b
#4 df2 X.32 another.topic
#5 df2 3 Line 1
#6 df2 4 Line 2
Here's an approach using map, rbindlist from data.table and some base R functions:
library(purrr)
library(dplyr)
library(data.table)
# Put each data frame's column names on top of its values (as a character
# matrix), drop the names, and stack everything with an `id` column.
lst %>%
  map(~ as.data.frame(unname(rbind(colnames(.x), as.matrix(.x))))) %>%
  rbindlist(idcol = "id")
# id V1 V2
#1: df1 X.1 heading
#2: df1 1 a
#3: df1 2 b
#4: df2 X.32 another.topic
#5: df2 3 Line 1
#6: df2 4 Line 2
Alternatively we could use map_df if we use colnames<-:
# Same idea, but name the columns h1, h2, ... explicitly. seq_len(ncol(.))
# replaces seq(1, dim(.)[2]), which would count 1, 0 for a zero-column input.
map_df(lst, ~ as.data.frame(rbind(colnames(.x), as.matrix(.x))) %>%
  `colnames<-`(., paste0("h", seq_len(ncol(.)))), .id = "id")
# id h1 h2
#1 df1 X.1 heading
#2 df1 1 a
#3 df1 2 b
#4 df2 X.32 another.topic
#5 df2 3 Line 1
#6 df2 4 Line 2
Key things here are:
Use as.matrix to get rid of the factor / character incompatibility.
Remove names with unname or set them with colnames<-
Use the idcol = (data.table::rbindlist) or .id = (purrr/dplyr) argument to get the names of the list as a column.
I altered your sample data a bit, setting stringsAsFactors to FALSE when creating the data.frames in lst.
Here is a solution using data.table::rbindlist().
#sample data
lst <- list(df1 = data.frame(X.1 = as.character(1:2),
                             heading = letters[1:2],
                             stringsAsFactors = FALSE),  # !! <--
            df2 = data.frame(X.32 = as.character(3:4),
                             another.topic = paste("Line ", 1:2),
                             stringsAsFactors = FALSE)  # !! <--
)
# Prepend each data frame's column names as its first row, then stack the
# pieces by position, keeping the list names in an `id` column.
DT <- data.table::rbindlist(lapply(lst, function(x) rbind(names(x), x)),
                            use.names = FALSE, idcol = "id")
# The stacked table inherits the first data frame's column names; rename them.
# setnames() is namespace-qualified because data.table is not attached here.
data.table::setnames(DT, names(lst[[1]]), c("h1", "h2"))
# id h1 h2
# 1: df1 X.1 heading
# 2: df1 1 a
# 3: df1 2 b
# 4: df2 X.32 another.topic
# 5: df2 3 Line 1
# 6: df2 4 Line 2

How to split string vector by first desired symbol in R?

Here is an example.
library(tidyverse)
# Example input: strings to be split at their first "-".
df <- data.frame(
  x = c("a-b-c", "b-d", "c-d_e-f")
)
df
# x
#1 a-b-c
#2 b-d
#3 c-d_e-f
What I wanted is to separate the column x by the first -, thus a desired output would be:
x y
1 a b-c
2 b d
3 c d_e-f
1) separate Use separate in the tidyr package:
library(dplyr)
library(tidyr)
# Split x at "-" into two columns; extra = "merge" keeps everything after the
# first "-" together in y.
separate(df, x, into = c("x", "y"), sep = "-", extra = "merge")
giving:
x y
1 a b-c
2 b d
3 c d_e-f
2) Base R Without using any packages we can use read.table. Replace the first minus with space first.
# Turn only the first "-" into a space, then let read.table() split on
# whitespace into the two columns, keeping them as character.
read.table(
  text = sub("-", " ", df$x, fixed = TRUE),
  col.names = c("x", "y"),
  stringsAsFactors = FALSE
)
giving:
x y
1 a b-c
2 b d
3 c d_e-f
3) read.fwf If the first field is always one character, as it is in the question, then we can use read.fwf
# Fixed-width read: 1 character for x, 1 for the separator, up to 99 for y.
# The "NULL" column class drops the separator field, so only x and y remain.
read.fwf(file = textConnection(as.character(df$x)), widths = c(1, 1, 99),
colClasses = c(NA, "NULL", NA), col.names = c("x", NA, "y"))
giving:
x y
1 a b-c
2 b d
3 c d_e-f
We can use extract
library(tidyr)
library(dplyr)
# Capture everything before the first "-" as x and everything after it as y.
# The + sits inside the first group; with ([^-])+ the group would keep only
# the last character of a multi-character first field.
df %>%
  extract(x, into = c("x", "y"), regex = "^([^-]+)-(.*)")
Or with separate with making use of extra
# Split at "-" into x and y; anything beyond the first split stays in y.
separate(df, col = x, into = c("x", "y"), sep = "-", extra = "merge")
# x y
#1 a b-c
#2 b d
#3 c d_e-f

Remove period and spaces within column headings nested in a list of data frames

I have a list of data frames:
# Two data frames whose second column name contains one or two periods.
mylist <- list(
  df1 = data.frame(var1 = letters[1:3], var.2 = letters[1:3]),
  df2 = data.frame(var1 = letters[1:3], var..2 = letters[1:3])
)
I would like to remove periods and spaces within the column headings of each data frame within the list. The output would look like:
# Desired result: the same data with the periods removed from the names.
mylist <- list(
  df1 = data.frame(var1 = letters[1:3], var2 = letters[1:3]),
  df2 = data.frame(var1 = letters[1:3], var2 = letters[1:3])
)
I have tried the following:
# Attempt from the question.
# NOTE(review): `ldf` is not defined in this snippet; the list above is called
# `mylist` - confirm which object was intended.
# The cleaned names are used to index x rather than to rename it; presumably
# that is why the attempt fails.
cleandf <- lapply(ldf, function(x) x[(colnames(x) <- gsub(".", "",
colnames(x), fixed = TRUE))])
With Base R setNames:
# Return each data frame with every period stripped from its column names.
lapply(mylist, function(d) {
  `names<-`(d, gsub(".", "", names(d), fixed = TRUE))
})
or with tidyverse:
library(tidyverse)
# Rename all columns of each data frame, replacing every period with "".
# The extra arguments are forwarded by map() to rename_all().
map(mylist, rename_all, str_replace_all, "\\.", "")
Output:
$df1
var1 var2
1 a a
2 b b
3 c c
$df2
var1 var2
1 a a
2 b b
3 c c
I rename the columns in each data frame and then return the data frame. As explained here, double backslashes are needed as escape characters for the period.
# Overwrite the column names with their period-free versions and hand back
# the modified data frame.
lapply(mylist, function(tbl) {
  colnames(tbl) <- gsub("[.]", "", colnames(tbl))
  tbl
})
# $`df1`
# var1 var2
# 1 a a
# 2 b b
# 3 c c
#
# $df2
# var1 var2
# 1 a a
# 2 b b
# 3 c c

Applying tidyr to separate only specific rows by specifying which rows to exclude

I would like to separate a column by a condition that excludes certain rows. This is a minor variation on this question: Applying tidyr separate only to specific rows But instead of specifying which rows to separate, I'd like to specify which rows to exclude from separating.
For example, let's say we want to split all rows of the 'text' column, except for the ones that contain here_do:
#creating DF for the example
# Example data: five rows, a random integer column, and an underscore-separated
# text column.
df <- data.frame(
  var_a = c("a", "b", "c", "d", "e"),
  var_b = sample(1:100, 5),
  text = c("foo_bla", "here_do", "oh_yes", "ba_a", "lan_d")
)
I guess there would be some way of using extract as we see in the related question, but I can't seem to figure out how to modify the "(here)_(do)" part to make it work:
library(tidyr)
# Attempt from the question: the regex has two capture groups, one per name in
# `into`; remove = FALSE keeps the original text column.
# NOTE(review): this pattern describes "here_do" itself, i.e. the row that
# should be excluded, not the rows that should be split.
extract(df, text, into = c("first", "sec"), "(here)_(do)", remove = FALSE)
If you don't mind using "data.table" instead, you can try:
library(data.table)
# Convert df to a data.table in place, then split `text` at "_" into two new
# columns for every row whose text is not "here_do"; finally print the result.
setDT(df)
df[!(text %in% "here_do"),
   c("first", "second") := tstrsplit(text, "_", fixed = TRUE)]
df[]
# var_a var_b text first second
# 1: a 40 foo_bla foo bla
# 2: b 4 here_do NA NA
# 3: c 12 oh_yes oh yes
# 4: d 35 ba_a ba a
# 5: e 11 lan_d lan d
One way is to separate everything and then "unseparate" the rows you wanted to exclude.
library('tidyverse')
df <- data.frame(var_a = letters[1:5],
                 var_b = c(sample(1:100, 5)),
                 text = c("foo_bla",
                          "here_do",
                          "oh_yes",
                          "ba_a",
                          "lan_d"),
                 stringsAsFactors = FALSE)
# Split every row at "_", then undo the split for the excluded row: its
# first_val gets the whole text back and its second_val becomes NA.
df %>%
  separate(text, c("first_val", "second_val"), sep = "_", remove = FALSE) %>%
  mutate(
    first_val = ifelse(text == "here_do", text, first_val),
    # Keep the real second piece for all other rows (this previously copied
    # first_val by mistake).
    second_val = ifelse(text == "here_do", NA_character_, second_val))
#> var_a var_b text first_val second_val
#> 1 a 45 foo_bla foo bla
#> 2 b 43 here_do here_do <NA>
#> 3 c 81 oh_yes oh yes
#> 4 d 33 ba_a ba a
#> 5 e 15 lan_d lan d
We can filter out the row that you do not want to separate, separate the rest of the rows, and then join the result back to the original data frame.
library(dplyr)
library(tidyr)
# Separate only the rows that are not excluded, then attach the new columns to
# the full data frame. Joining from df with left_join() keeps the rows in their
# original order; the excluded row simply gets NA in First and Second.
df2 <- df %>%
  left_join(
    df %>%
      filter(!(text %in% "here_do")) %>%
      separate(text, into = c("First", "Second"), remove = FALSE),
    by = c("var_a", "var_b", "text")
  )
df2
# var_a var_b text First Second
# 1 a 19 foo_bla foo bla
# 2 b 90 here_do <NA> <NA>
# 3 c 21 oh_yes oh yes
# 4 d 6 ba_a ba a
# 5 e 15 lan_d lan d
DATA
# Reproducible example data: the seed fixes the random var_b values.
set.seed(244)
df <- data.frame(
  var_a = c("a", "b", "c", "d", "e"),
  var_b = sample(1:100, 5),
  text = c("foo_bla", "here_do", "oh_yes", "ba_a", "lan_d")
)

Resources