Split Columns in a List of Dataframes R - r

I have a list of data frames which some columns have this special character ->(arrow). Now i do want to loop through this list of data frames and locate columns with this -> (arrow) then the new columns be named with a suffix _old and _new. This is a sample of data frames :
dput(df1)
df1 <- structure(list(v1 = c("reg->joy", "ress", "mer->dls"),
t2 = c("James","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
dput(df2)
df2 <- structure(list(v1 = c("me", "df", "kl"),
t2 = c("James","Jane->dlt", "Egg"),
t3 = c("James ->may","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
dput(df3)
df3 <- structure(list(v1 = c("56->34", "df23-> ", "mkl"),
t2 = c("James","Jane", "Egg"),
d3 = c("James->","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
This is what I have tried
dfs <- list(df1,df2,df3)
for (y in 1:length(dfs)){
setDT(dfs[[y]])
df1<- lapply(names(dfs[[y]]), function(x) {
mDT <- df2[[y]][, tstrsplit(get(x), " *-> *")]
if (ncol(mDT) == 2L) setnames(mDT, paste0(x, c("_old", "_new")))
}) %>% as.data.table()
}
This only splits one data frame, I need to split all of the data frames.
NOTE: The code I have splits so well on one dataframe, what I want is how to implement it on a List of data frames
EXPECTED OUTPUT
dput(df1)
df1 <- structure(list(v1_old = c("reg", "mer"),
v1_new = c("joy", "dls")),
class = "data.frame", row.names = c(NA, -3L))
dput(df2)
df2 <- structure(list(t2_old = c("dlt"),
t2_new = c("dlt"),
t3_old = c("James"),
t3_new = c("may")),
class = "data.frame", row.names = c(NA, -3L))
dput(df3)
df3 <- structure(list(v1_old = c("56", "df23 "),
v1_new = c("34", " "),
d3 = c("James"),
d3 = c(" ")),
class = "data.frame", row.names = c(NA, -3L))

I add below a solution using the tidyverse.
Select the columns if one of the strings in the columns contains an arrow:
col_arrow_ls <- purrr::map(dfs, ~select_if(., ~any(str_detect(., "->"))))
Then split the function using tidyr::separate. Since each element of the output is a data frame, purrr::map_dfc is used to column-bind them together:
split_df_fn <- function(df1){
names(df1) %>%
map_dfc(~ df1 %>%
select(.x) %>%
tidyr::separate(.x,
into = paste0(.x, c("_old", "_new")),
sep = "->")
)
}
Apply the function to the list of data frames.
purrr::map(col_arrow_ls, split_df_fn)
[[1]]
v1_old v1_new
1 reg joy
2 ress <NA>
3 mer dls
[[2]]
t2_old t2_new t3_old t3_new
1 James <NA> James may
2 Jane dlt Jane <NA>
3 Egg <NA> Egg <NA>
[[3]]
v1_old v1_new d3_old d3_new
1 56 34 James
2 df23 Jane <NA>
3 mkl <NA> Egg <NA>

Related

Sum characters from a string using a lookup table in R

I have a lookup dataframe (df1) like this:
col1 col2
A 71
R 156
N 114
D 115
...
and I have a data frame (df2) containing a column of strings like this:
[1] "AARA"
[2] "DDNRRRNRAAN"
[3] "RNDARANDRN"
...
I would like to create a new column in df2 that, for each string, looks up the series of corresponding numbers from df1 and sums them. So, the first row in the new column of df2 would have the value 369 (= 71 + 71 + 156 + 71). How could I go about this task?
One more tidyverse strategy
lookup <- structure(list(col1 = c("A", "R", "N", "D"), col2 = c(71L, 156L,
114L, 115L)), class = "data.frame", row.names = c(NA, -4L))
df <- structure(list(col = c("AARA", "DDNRRRNRAAN", "RNDARANDRN")),
class = "data.frame", row.names = c(NA, -3L))
library(tidyverse)
df %>%
mutate(SUM = map_dbl(str_split(col, ''), ~ sum(lookup$col2[match(.x, lookup$col1)])))
#> col SUM
#> 1 AARA 369
#> 2 DDNRRRNRAAN 1338
#> 3 RNDARANDRN 1182
Created on 2021-06-13 by the reprex package (v2.0.0)
Split the string at every character, use match to get corresponding value for each character and sum them.
df2$res <- sapply(strsplit(df2$col, ''), function(x)
sum(df1$col2[match(x, df1$col1)], na.rm = TRUE))
df2
# col res
#1 AARA 369
#2 DDNRRRNRAAN 1338
#3 RNDARANDRN 1182
Using the same logic a tidyverse option would be -
library(dplyr)
library(tidyr)
df2 %>%
mutate(row = row_number()) %>%
separate_rows(col, sep = '') %>%
left_join(df1, by = c('col' = 'col1')) %>%
group_by(row) %>%
summarise(col = paste0(col, collapse = ''),
col2 = sum(col2, na.rm = TRUE)) %>%
select(-row)
data
df1 <- structure(list(col1 = c("A", "R", "N", "D"), col2 = c(71L, 156L,
114L, 115L)), class = "data.frame", row.names = c(NA, -4L))
df2 <- structure(list(col = c("AARA", "DDNRRRNRAAN", "RNDARANDRN")),
class = "data.frame", row.names = c(NA, -3L))

The first two columns defined as "rownames"

I want to define the first two columns of a data frame as rownames. Actually I want to do some calculations and the data frame has to be numeric for that.
data.frame <- data_frame(id=c("A1","B2"),name=c("julia","daniel"),BMI=c("20","49"))
The values for BMI are numerical (proved with is.numeric), but the over all data.frame not. How to define the first two columns (id and name) as rownames?
Thank you in advance for any suggestions
You can combine id and name column and then assign rownames
data.frame %>%
tidyr::unite(rowname, id, name) %>%
tibble::column_to_rownames()
# BMI
#A1_julia 20
#B2_daniel 49
In base R, you can do the same in steps as
data.frame <- as.data.frame(data.frame)
rownames(data.frame) <- paste(data.frame$id, data.frame$name, sep = "_")
data.frame[c('id', 'name')] <- NULL
Not sure if the code and result below is the thing you are after:
dfout <- `rownames<-`(data.frame(BMI = as.numeric(df$BMI)),paste(df$id,df$name))
such that
> dfout
BMI
A1 julia 20
B2 daniel 49
DATA
df <- structure(list(id = structure(1:2, .Label = c("A1", "B2"), class = "factor"),
name = structure(2:1, .Label = c("daniel", "julia"), class = "factor"),
BMI = structure(1:2, .Label = c("20", "49"), class = "factor")), class = "data.frame", row.names = c(NA,
-2L))

Bind r data.frames that contain column(s) of nested data.frames

After importing multiple .json files using jsonlite I was looking for ways to bind the resulting data.frames which contained one or more columns which themselves were nested data.frames.
I came across the following post https://r.789695.n4.nabble.com/data-frame-with-nested-data-frame-td3162660.html, which helped highlight the problem.
## Create nested data.frames
dat1 <- data.frame(x = 1)
dat1$y <- data.frame(y1 = "a", y2 = "A", stringsAsFactors = FALSE)
dat2 <- data.frame(x = 2)
dat2$y <- data.frame(y1 = "b", stringsAsFactors = FALSE)
None of these work
rbind(dat1, dat2)
dplyr::bind_rows(dat1, dat2)
data.table::rbindlist(list(dat1, dat2))
I've discovered a few workarounds which I'll post below in case they help others.
This could be done without additional packages, too. The data frames need to be partly unlisted within a list and then merged using Reduce.
Reduce(function(...) merge(..., all=TRUE), Map(unlist, list(dat1, dat2), recursive=FALSE))
# x y.y1 y.y2
# 1 1 a A
# 2 2 b <NA>
This also works with more than two nested data frames.
dat3 <- data.frame(x=2, y=data.frame(y1="c", y2="C", z="CC", stringsAsFactors=FALSE))
Reduce(function(...) merge(..., all=TRUE), Map(unlist, list(dat1, dat2, dat3), recursive=FALSE))
# x y.y1 y.y2 y.z
# 1 1 a A <NA>
# 2 2 b <NA> <NA>
# 3 2 c C CC
Data
dat1 <- structure(list(x = 1, y = structure(list(y1 = "a", y2 = "A"), class = "data.frame",
row.names = c(NA, -1L))), row.names = c(NA, -1L),
class = "data.frame")
dat2 <- structure(list(x = 2, y = structure(list(y1 = "b"), class = "data.frame",
row.names = c(NA, -1L))), row.names = c(NA, -1L),
class = "data.frame")
Flatten the data first (for base rbind data.frames need to have identical column names)
dplyr::bind_rows(
jsonlite::flatten(dat1),
jsonlite::flatten(dat2)
)
Put the data.frames into a list before binding (all approaches now work)
dat1$y <- list(dat1$y)
dat2$y <- list(dat2$y)
rbind(dat1, dat2)
dplyr::bind_rows(dat1, dat2)
data.table::rbindlist(list(dat1, dat2))
Use the tidyverse to nest the data.frames
tib1 <- tidyr::nest(dat1, y = c(y))
tib2 <- tidyr::nest(dat2, y = c(y))
tib3 <- dplyr::bind_rows(tib1, tib2)
tidyr::unnest(tib3, c(y))

Filter rows based on one column from a list of dataframes

I have a list of multiple data frames and I would like to filter these data frames in a list by certain values in one column of each data frame. Each data frame in the list has a column called v1, which has special characters ++, ->, Now I do want to filter only rows having this arrow (->) in each data frame in a list. This is a sample of my dataframes,
dput(df)
df1 <- structure(list(v1 = c("->", "++", "->"),
t2 = c("James","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
dput(df2)
df2 <- structure(list(v1 = c("++", "->", "->"),
t2 = c("James","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
dput(df3)
df3 <- structure(list(v1 = c("++", "++", "->"),
t2 = c("James","Jane", "Egg"),
d3...c = c("James","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
I have tried this but I am not getting the dataframes of filtered rows
idx = "->"
dfs <- list(df1,df2,df3)
lapply(dfs, function(x) x$v1 %in% idx)
someone help
idx <- "->"
# Base R
lapply(dfs, function(df) df[df$v1 == "->",])
lapply(dfs, function(df) df[df$v1 %in% idx,])
# tidyverse
library("purrr")
library("dplyr")
map(dfs, filter, v1 == "->")
map(dfs, filter, v1 %in% !! idx)
Try this:
idx <- "->"
fnct <- function(df){df <- df[df$v1 %in% idx, ]}
df1_idx <- fnct(df1)
df2_idx <- fnct(df2)
df3_idx <- fnct(df3)
dfs <- list(df1_idx, df2_idx, df3_idx)
dfs
Result:
[[1]]
v1 t2
1 -> James
3 -> Egg
[[2]]
v1 t2
2 -> Jane
3 -> Egg
[[3]]
v1 t2 d3...c
3 -> Egg Egg

split columns in a list of dataframes in R

I have a list of data frames which some columns have this special character ->(arrow). Now i do want to loop through this list of data frames and locate columns with this -> (arrow) then the new columns be named with a suffix _old and _new. This is a sample of data frames :
dput(df1)
df1 <- structure(list(v1 = c("reg->joy", "ress", "mer->dls"),
t2 = c("James","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
dput(df2)
df2 <- structure(list(v1 = c("me", "df", "kl"),
t2 = c("James","Jane->dlt", "Egg"),
t3 = c("James ->may","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
dput(df3)
df3 <- structure(list(v1 = c("56->34", "df23-> ", "mkl"),
t2 = c("James","Jane", "Egg"),
d3 = c("James->","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
This is what I have tried
dfs <- list(df1,df2,df3)
for (y in 1:length(dfs)){
setDT(dfs[[y]])
df1<- lapply(names(dfs[[y]]), function(x) {
mDT <- df2[[y]][, tstrsplit(get(x), " *-> *")]
if (ncol(mDT) == 2L) setnames(mDT, paste0(x, c("_old", "_new")))
}) %>% as.data.table()
}
This only splits one data frame, I need to split all of the data frames
EXPECTED OUTPUT
dput(df1)
df1 <- structure(list(v1_old = c("reg", "mer"),
v1_new = c("joy", "dls")),
class = "data.frame", row.names = c(NA, -3L))
dput(df2)
df2 <- structure(list(t2_old = c("dlt"),
t2_new = c("dlt"),
t3_old = c("James"),
t3_new = c("may")),
class = "data.frame", row.names = c(NA, -3L))
dput(df3)
df3 <- structure(list(v1_old = c("56", "df23 "),
v1_new = c("34", " "),
d3 = c("James"),
d3 = c(" ")),
class = "data.frame", row.names = c(NA, -3L))
So I have played around and found the answer
df1 <-c()
for (y in 1:length(dfs)){
setDT(dfs[[y]])
df1[[y]] <- lapply(names(modifiedtbl[[y]]), function(x) {
mDT <- dfs[[y]][, tstrsplit(get(x), " *-> *")]
if (ncol(mDT) == 2L) setnames(mDT, paste0(x, c("_old", "_new")))
}) %>% as.data.table()
}

Resources