Iterate over column names and separate fields recursively with dplyr - r

I want to iterate over the column names of the data frame and then, using dplyr, separate fields on a delimiter (->) found in the row values. This is what the dataset looks like:
dput(df)
structure(list(v1 = c("Silva->Mark", "Brandon->Livo", "Mango->Apple"),
v2 = c("Austin", "NA ", "Orange"),
v3 = c("James -> Jacy","NA->Jane", "apple -> Orange")),
class = "data.frame", row.names = c(NA, -3L))
I wrote code that identifies the columns whose rows contain the delimiter (->), which are columns v1 and v3. Here is the code:
rows_true <- apply(df,2,function(x) any(sapply(x,function(y)grepl("->",y))))
ss<-df[,rows_true]
Then I tried to loop through those column names so that I can separate on the delimiter using the code below, but it isn't working:
cols<- names(df)
if (names %in% df){
splitcols <- ss %>%
tidyr::separate(cols, into = c(paste0(names,+ "old"), "paste0(names,+ "New")"), sep = "->")
}
The reason I am using paste0 is that I want each matching column split in two on the delimiter, with the newly formed columns named after the original column plus the suffix Old for the first piece and New for the second.
The end result after looping through the column names and separating each of them should look like this (a working sketch of such a loop is shown after the expected output):
dput(df)
structure(list(v1_Old = c("Silva", "Brandon", "Mango"),
v1_New = c("Mark", "Livo", "Apple"),
v3_Old = c("James","NA", "apple"),
v3_New = c("Jacy","Jane", "Orange")),
class = "data.frame", row.names = c(NA, -3L))

For the sake of completeness, here is also a solution which uses data.table.
There are some differences from the other answers posted so far:
It is not required to identify the columns to be split beforehand. Instead, columns without "->" are dropped from the result on the fly.
The regular expression used for splitting, " *-> *", includes any surrounding white space. This avoids having to call trimws() on the resulting pieces afterwards or to remove white space beforehand (see the short illustration after the output below).
library(data.table)
library(magrittr) # piping used to improve readability
setDT(df)
lapply(names(df), function(x) {
  mDT <- df[, tstrsplit(get(x), " *-> *")]
  if (ncol(mDT) == 2L) setnames(mDT, paste0(x, c("_Old", "_New")))
}) %>%
  as.data.table()
v1_Old v1_New v3_Old v3_New
1: Silva Mark James Jacy
2: Brandon Livo NA Jane
3: Mango Apple apple Orange
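As a quick standalone illustration of the whitespace-absorbing split pattern mentioned above (an added example, not part of the original answer):
data.table::tstrsplit("James -> Jacy", " *-> *")
#> [[1]]
#> [1] "James"
#>
#> [[2]]
#> [1] "Jacy"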

One possibility involving dplyr and tidyr could be:
library(dplyr)
library(tidyr)
library(tibble)

df %>%
  select(v1, v3) %>%
  rowid_to_column() %>%
  gather(var, val, -rowid) %>%
  separate_rows(val, sep = "->", convert = TRUE) %>%
  group_by(rowid) %>%
  mutate(val = trimws(val),
         var = make.unique(var)) %>%
  ungroup() %>%
  spread(var, val) %>%
  select(-rowid)
v1 v1.1 v3 v3.1
<chr> <chr> <chr> <chr>
1 Silva Mark James Jacy
2 Brandon Livo <NA> Jane
3 Mango Apple apple Orange
Or to further match the expected output:
df %>%
  select(v1, v3) %>%
  rowid_to_column() %>%
  gather(var, val, -rowid) %>%
  separate_rows(val, sep = "->", convert = TRUE) %>%
  group_by(rowid, var) %>%
  mutate(val = trimws(val),
         var2 = if_else(row_number() == 1, paste0(var, "_old"), paste0(var, "_new"))) %>%
  ungroup() %>%
  select(-var) %>%
  spread(var2, val) %>%
  select(-rowid)
v1_new v1_old v3_new v3_old
<chr> <chr> <chr> <chr>
1 Mark Silva Jacy James
2 Livo Brandon Jane <NA>
3 Apple Mango Orange apple
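Since gather() and spread() are superseded, roughly the same pipeline can be written with the newer pivot verbs; a sketch under the same assumptions and packages as above, not a tested drop-in replacement:
df %>%
  select(v1, v3) %>%
  rowid_to_column() %>%
  pivot_longer(-rowid, names_to = "var", values_to = "val") %>%
  separate_rows(val, sep = "->", convert = TRUE) %>%
  group_by(rowid, var) %>%
  mutate(val = trimws(val),
         var2 = if_else(row_number() == 1, paste0(var, "_old"), paste0(var, "_new"))) %>%
  ungroup() %>%
  select(-var) %>%
  pivot_wider(names_from = var2, values_from = val) %>%
  select(-rowid)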

A different approach with dplyr, purrr, and stringr is the following.
library(dplyr)
library(purrr)
library(stringr)
# Detect the columns with at least one "->"
my_df_cols <- map_lgl(my_df, ~ any(str_detect(., "->")))

my_df %>%
  # Select only the columns with at least one "->"
  select(which(my_df_cols)) %>%
  # Mutate these columns and keep only the mutated columns with new names
  transmute_all(list(old = ~ str_split(., "->", simplify = TRUE)[, 1],
                     new = ~ str_split(., "->", simplify = TRUE)[, 2]))
# v1_old v3_old v1_new v3_new
# 1 Silva James Mark Jacy
# 2 Brandon NA Livo Jane
# 3 Mango apple Apple Orange
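As a side note, transmute_all() is superseded in current dplyr. Assuming the same my_df and my_df_cols objects and the packages loaded above, an across()-based sketch of the same idea could look like this (the _old/_new names are the same, although the column order may differ):
my_df %>%
  select(which(my_df_cols)) %>%
  transmute(across(everything(),
                   list(old = ~ str_split(.x, "->", simplify = TRUE)[, 1],
                        new = ~ str_split(.x, "->", simplify = TRUE)[, 2])))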

We can also use cSplit from splitstackshape
#Detect columns with "->"
cols <- names(df)[colSums(sapply(df, grepl, pattern = "->")) > 0]
#Remove unwanted whitespaces before and after "->"
df[cols] <- lapply(df[cols], function(x) gsub("\\s+", "", x))
#Split into new columns specifying sep as "->"
splitstackshape::cSplit(df[cols], cols, sep = "->")
# v1_1 v1_2 v3_1 v3_2
#1: Silva Mark James Jacy
#2: Brandon Livo <NA> Jane
#3: Mango Apple apple Orange
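If the v*_Old / v*_New names from the question are required, the cSplit() result (a data.table) could be renamed afterwards; a small sketch reusing df and cols from above:
out <- splitstackshape::cSplit(df[cols], cols, sep = "->")
data.table::setnames(out, sub("_2$", "_New", sub("_1$", "_Old", names(out))))
out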

Related

convert json list to a data frame in R

I have a json file as follows:
{
"1234":{"Messages":{"1":{"Content":["How are you","today"]},"2":{"Content":["I am great"]}}},
"2344":{"Messages":{"1":{"Content":["It's a plan"]}}}}
I am trying to convert this content to this data frame:
df <- data.frame(id=c(1234,2344), Content1=c("How are you today","It's a plan"), Content2=c("I am great", ""))
I have tried a few things with jsonlite and pluck but am stuck on the iterative part of the code.
Any advice appreciated.
Thank you.
We could read the .json into a list with fromJSON, then get the 'Content' elements with rrapply, and convert to a data.frame:
library(jsonlite)
library(rrapply)
library(dplyr)
library(tidyr)

lst1 <- fromJSON("file1.json")
rrapply(lst1, condition = function(x, .xname) .xname == 'Content',
        how = "melt") %>%
  select(-L2) %>%
  unite(L4, L4, L3, sep = "") %>%
  unnest(value) %>%
  pivot_wider(names_from = L4, values_from = value, values_fn = toString)
-output
# A tibble: 2 × 3
L1 Content1 Content2
<chr> <chr> <chr>
1 1234 How are you, today I am great
2 2344 It's a plan <NA>
This is admittedly a hack:
data.table::rbindlist(
  # L is the nested list returned by jsonlite::fromJSON("file1.json")
  lapply(rapply(L, function(z) paste(z, collapse = " "), how = "replace"),
         function(z) as.data.frame(z$Messages)),
  fill = TRUE, idcol = "id")
# id Content Content.1
# <char> <char> <char>
# 1: 1234 How are you today I am great
# 2: 2344 It's a plan <NA>
It also works with dplyr if you prefer:
dplyr::bind_rows(
  lapply(rapply(L, function(z) paste(z, collapse = " "), how = "replace"),
         function(z) as.data.frame(z$Messages)),
  .id = "id")
Within the tidyverse we could use pivot_longer followed by unnest_wider:
library(tidyr)
library(tibble)
library(jsonlite)
# jsontext is the JSON from the question stored as a single string
jsontext |>
  fromJSON() |>
  as_tibble() |>
  pivot_longer(everything()) |>
  unnest_wider(value, transform = ~ paste(unlist(.), collapse = " "),
               names_sep = "_")
Output
# A tibble: 2 × 3
name value_1 value_2
<chr> <chr> <chr>
1 1234 How are you today "I am great"
2 2344 It's a plan ""
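To land exactly on the id / Content1 / Content2 names requested in the question, the result could be renamed afterwards; a sketch, where res stands for the tibble printed above (an assumed name, not from the original answer):
library(dplyr)
res %>%
  rename(id = name, Content1 = value_1, Content2 = value_2)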

How to get unique occurrences of these character strings separated by ";"?

So I have a column with values in this structure:
df <- tribble(
  ~col,
  "AA_BB;AA_AA;AA_BB",
  "BB_BB;AA_AA",
  "AA_BB",
  "BB_AA;BB_AA;AA_AA;BB_AA"
)
So each row has items separated by a ";". The first row has items AA_BB, AA_AA and AA_BB. I want the first row to be transformed to "AA_BB;AA_AA" and the last row to be transformed to "BB_AA;AA_AA".
I thought about using separate, but the result didn't really help me (especially since I don't know how many columns there can be at most).
df %>%
separate(col, into = c("A", "B", "C", "D"), sep = ";")
Any tips on how to do this?
We can split the column, get the unique elements, and paste them back together:
library(dplyr)
library(stringr)
library(purrr)
df %>%
mutate(col = map_chr(strsplit(col, ";"), ~ str_c(unique(.x), collapse=";")))
-output
# A tibble: 4 x 1
# col
# <chr>
#1 AA_BB;AA_AA
#2 BB_BB;AA_AA
#3 AA_BB
#4 BB_AA;AA_AA
Or split with separate_rows, then do a group-by paste after getting the distinct rows:
library(tidyr)
df %>%
  mutate(rn = row_number()) %>%
  separate_rows(col, sep = ";") %>%
  distinct %>%
  group_by(rn) %>%
  summarise(col = str_c(col, collapse = ";"), .groups = 'drop') %>%
  select(col)
In base R, you can split the string on semi-colon, keep only unique strings and paste them together.
df$col1 <- sapply(strsplit(df$col, ';'), function(x)
paste0(unique(x), collapse = ';'))
df
# A tibble: 4 x 2
# col col1
# <chr> <chr>
#1 AA_BB;AA_AA;AA_BB AA_BB;AA_AA
#2 BB_BB;AA_AA BB_BB;AA_AA
#3 AA_BB AA_BB
#4 BB_AA;BB_AA;AA_AA;BB_AA BB_AA;AA_AA

How to remove everything from a row except pattern

I have a data frame that contains one column whose values are separated by ";", like this:
AB00001;09843;AB00002;GD00001
AB84375;34
AB84375;AB84375
74859375;AB001;4455;FG3455
What I want is to remove everything except the codes that start with AB:
AB00001;AB00002
AB84375
AB84375;AB84375
AB001
I've tried to separate them with separate(), but I don't know how to continue. Any suggestions?
If your data frame is called df and your column is called V1, you could try:
sapply(strsplit(df$V1, ";"), function(x) paste(grep("^AB", x, value = TRUE), collapse = ";"))
#> [1] "AB00001;AB00002" "AB84375" "AB84375;AB84375" "AB001"
This splits at all the semicolons then matches all strings starting with "AB", then joins them back together with semicolons.
I thought of using stringr and Daniel O's data:
df %>%
mutate(data = str_extract_all(data, "AB\\w+"))
which gives us
data
1 AB00001, AB00002
2 AB84375
3 AB84375, AB84375
4 AB001
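If a single ";"-separated string per row is needed (as in the expected output) rather than a list column, the matches could be collapsed; a sketch assuming the same df and the dplyr/stringr packages used above:
df %>%
  mutate(data = sapply(str_extract_all(data, "AB\\w+"), paste, collapse = ";"))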
1) Base R Assuming DF is defined reproducibly as in the Note at the end, we prefix each line with a semicolon, then use gsub with the pattern shown, and finally remove the semicolon we added. No packages are used.
transform(DF, V1 = sub("^;", "", gsub("(;AB\\d+)|;[^;]*", "\\1", paste0(";", V1))))
giving:
V1
1 AB00001;AB00002
2 AB84375
3 AB84375;AB84375
4 AB001
2) dplyr/tidyr This one is longer than the others in this answer, but it is straightforward and has no complex regular expressions.
library(dplyr)
library(tidyr)
DF %>%
  mutate(id = 1:n()) %>%
  separate_rows(V1, sep = ";") %>%
  filter(substr(V1, 1, 2) == "AB") %>%
  group_by(id) %>%
  summarize(V1 = paste(V1, collapse = ";")) %>%
  ungroup %>%
  select(-id)
giving:
# A tibble: 4 x 1
V1
<chr>
1 AB00001;AB00002
2 AB84375
3 AB84375;AB84375
4 AB001
3) gsubfn Replace codes that do not start with AB with an empty string and then remove redundant semicolons from what is left.
library(gsubfn)
transform(DF, V1 = gsub("^;|;$", "", gsub(";+", ";",
gsubfn("[^;]*", ~ if (substr(x, 1, 2) == "AB") x else "", V1))))
giving:
V1
1 AB00001;AB00002
2 AB84375
3 AB84375;AB84375
4 AB001
Note
Lines <- "AB00001;09843;AB00002;GD00001
AB84375;34
AB84375;AB84375
74859375;AB001;4455;FG3455"
DF <- read.table(text = Lines, as.is = TRUE, strip.white = TRUE)

R: How to mutate new ID by modifying previous ID?

I asked the question (How to mutate a new column by modifying another column?).
Now I have another problem. I have to use more 'untidy' IDs like:
df1 <- data.frame(id=c("A-1","A-10","A-100","b-1","b-10","b-100"),n=c(1,2,3,4,5,6))
From these IDs, I want to assign new 'tidy' IDs like:
df2 <- data.frame(id=c("A0001","A0010","A0100","B0001","B0010","B0100"),n=c(1,2,3,4,5,6))
(now I need capital 'B' instead of 'b')
I tried to use the str_pad function, but I couldn't manage.
We can separate the data into two columns based on "-", convert the letters to uppercase, pad the numbers with zeros using sprintf, and combine the two columns with unite.
library(dplyr)
library(tidyr)
df1 %>%
  separate(id, c("id1", "id2"), sep = "-") %>%
  mutate(id1 = toupper(id1),
         id2 = sprintf('%04d', as.integer(id2))) %>%
  unite(id, id1, id2, sep = "")
# id n
#1 A0001 1
#2 A0010 2
#3 A0100 3
#4 B0001 4
#5 B0010 5
#6 B0100 6
Based on the comment: if there are cases where we don't have a separator and we want to change certain id1 values, we can use the following.
df1 %>%
  extract(id, c("id1", "id2"), regex = "([:alpha:])-?(\\d+)") %>%
  mutate(id1 = case_when(id1 == 'c' ~ 'B',
                         TRUE ~ id1),
         id1 = toupper(id1),
         id2 = sprintf('%04d', as.integer(id2))) %>%
  unite(id, id1, id2, sep = "")
The str_pad function is handy for this purpose, as you said. But you have to extract out the digits first and then paste it all back together.
library(stringr)
paste0(toupper(str_extract(df1$id, "[A-Za-z]+")),
       str_pad(str_extract(df1$id, "\\d+"), width = 4, pad = "0"))
[1] "A0001" "A0010" "A0100" "B0001" "B0010" "B0100"
Base R solution: replace the "-" with four zeros (after upper-casing), then strip the surplus zeros so that exactly four characters follow the leading letter.
df1$id <- sub("^(.)0+?(.{4})$","\\1\\2", sub("-", "0000", toupper(df1$id)))
tidyverse solution
library(tidyverse)
df1$id <- str_to_upper(df1$id) %>%
str_replace("-","0000") %>%
str_replace("^(.)0+?(.{4})$","\\1\\2")
Output
df1
# id n
# 1 A0001 1
# 2 A0010 2
# 3 A0100 3
# 4 B0001 4
# 5 B0010 5
# 6 B0100 6
Data
df1 <- data.frame(id=c("A-1","A-10","A-100","b-1","b-10","b-100"),n=c(1,2,3,4,5,6))

R: How do I c() nested character vectors grouped by another column?

I have strings containing enumerations of words grouped under word type. The example below only has one type for simplicity's sake.
ka = tibble(
words = c('apple, orange', 'pear, apple, plum'),
type = 'fruit'
)
I want to find out the number of UNIQUE words per type.
I figured I would split the character vectors,
ka = ka %>%
mutate(
word_list = str_split(words, ', ')
)
and then combine the vectors per group. The end result would be
c(
  ka$word_list[[1]],
  ka$word_list[[2]]
)
Then I can unique these vectors and get their length.
I don't know how to bind columns together, grouped by a separate column. I could do this with an ugly loop within a loop, but there must be a map/apply solution as well, following the logic of:
ka %>%
group_by(type) %>%
summarise(
biglist = map(word_list, ~ c(.)), # this doesn't work, obviously
biglist_unique = map(biglist, ~ unique(.)),
biglist_length = map(biglist_unique, ~ length(.))
)
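For reference, a minimal working sketch of this idea (an added illustration, not the original attempt), assuming ka with the word_list column created above:
library(dplyr)
library(purrr)

ka %>%
  group_by(type) %>%
  summarise(
    biglist        = list(unlist(word_list)),   # combine the per-row vectors
    biglist_unique = map(biglist, unique),
    biglist_length = map_int(biglist_unique, length)
  )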
Here is an option for you. First we collapse the vectors, then we map out what you're looking for. Note that we have to trim off the whitespace to get the proper unique words.
library(tidyverse)
ka %>%
  group_by(type) %>%
  summarise(all_words = paste(words, collapse = ",")) %>%
  mutate(biglist = str_split(all_words, ",") %>% map(., ~ str_trim(.x, "both")),
         biglist_unique = map(biglist, ~ unique(.x)),
         biglist_length = map_dbl(biglist_unique, length))
#> # A tibble: 1 x 5
#> type all_words biglist biglist_unique biglist_length
#> <chr> <chr> <list> <list> <dbl>
#> 1 fruit apple, orange,pear, apple, plum <chr [5]> <chr [4]> 4
Another option would be to use tidy data principles and the tidyr package.
ka = ka %>%
mutate(
word_list = str_split(words, ', ')
)
ka %>%
  # If you need to maintain information about each row you can create an index
  # mutate(index = row_number()) %>%
  # unnest the wordlist to get one word per row
  unnest(word_list) %>%
  # Only keep unique words per group
  group_by(type) %>%
  distinct(word_list, .keep_all = FALSE) %>% # if you need to maintain row info .keep_all = TRUE
  summarise(n_unique = n())
# A tibble: 1 x 2
# type n_unique
# <chr> <int>
# 1 fruit 4
Here's a way you can do it using separate_rows:
ka %>%
separate_rows(words, sep = ', ') %>%
group_by(type) %>%
summarise(word_c = n_distinct(words))
Something like this:
library(tidyverse)
ka %>%
  mutate(words = strsplit(as.character(words), ",")) %>%
  unnest(words) %>%
  mutate(words = gsub(" ", "", words)) %>%
  group_by(type) %>%
  summarise(number = n_distinct(words),
            words = paste0(unique(words), collapse = ' '))
# A tibble: 1 x 3
type number words
<chr> <int> <chr>
1 fruit 4 apple orange pear plum
