split columns in a list of dataframes in R

split columns in a list of dataframes in R - r

I have a list of data frames in which some columns contain the special character -> (arrow). I want to loop through this list of data frames, locate the columns that contain -> (arrow), and split each of them into two new columns named with the suffixes _old and _new. This is a sample of the data frames:
dput(df1)
df1 <- structure(list(v1 = c("reg->joy", "ress", "mer->dls"),
t2 = c("James","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
dput(df2)
df2 <- structure(list(v1 = c("me", "df", "kl"),
t2 = c("James","Jane->dlt", "Egg"),
t3 = c("James ->may","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
dput(df3)
df3 <- structure(list(v1 = c("56->34", "df23-> ", "mkl"),
t2 = c("James","Jane", "Egg"),
d3 = c("James->","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
This is what I have tried
dfs <- list(df1,df2,df3)
for (y in 1:length(dfs)){
setDT(dfs[[y]])
df1<- lapply(names(dfs[[y]]), function(x) {
mDT <- df2[[y]][, tstrsplit(get(x), " *-> *")]
if (ncol(mDT) == 2L) setnames(mDT, paste0(x, c("_old", "_new")))
}) %>% as.data.table()
}
This only splits one data frame, I need to split all of the data frames
EXPECTED OUTPUT
dput(df1)
df1 <- structure(list(v1_old = c("reg", "mer"),
v1_new = c("joy", "dls")),
class = "data.frame", row.names = c(NA, -3L))
dput(df2)
df2 <- structure(list(t2_old = c("dlt"),
t2_new = c("dlt"),
t3_old = c("James"),
t3_new = c("may")),
class = "data.frame", row.names = c(NA, -3L))
dput(df3)
df3 <- structure(list(v1_old = c("56", "df23 "),
v1_new = c("34", " "),
d3 = c("James"),
d3 = c(" ")),
class = "data.frame", row.names = c(NA, -3L))

So I have played around and found the answer
# For each data frame in `dfs`, split every column on "->" (surrounding spaces
# are part of the separator) and collect the per-data-frame results in `df1`.
# Needs data.table (setDT, tstrsplit, setnames, as.data.table) and the
# magrittr pipe to be attached.
df1 <- vector("list", length(dfs))
for (y in seq_along(dfs)) {
  setDT(dfs[[y]])
  # Iterate over the columns of the data frame being processed; the original
  # referenced an undefined object `modifiedtbl` here instead of `dfs`.
  df1[[y]] <- lapply(names(dfs[[y]]), function(x) {
    mDT <- dfs[[y]][, tstrsplit(get(x), " *-> *")]
    # Only columns that split into exactly two parts are renamed and kept;
    # for any other column the `if` yields NULL.
    if (ncol(mDT) == 2L) setnames(mDT, paste0(x, c("_old", "_new")))
  }) %>% as.data.table()
}

Related

Check if value from one data frame can be found in another data frame in R

I have two data.frames as follows:
a$id <- as.data.frame(c("1-23-2", "2-3-231-2", "122-121"))
b$id <- as.data.frame(c("1-23-2", "122-121", "12-1223-12", "1221-12"))
I want to check, if all values of a can be found in b.
I tried this:
if (a$id %in% b$id){a$test <- "yes"} else {a$test <- "no"}
Which gives a warning message and the wrong result unfortunately.

Use ifelse.
a$test <- ifelse(a$id %in% b$id, "yeah", "no")
a
# id test
# 1 1-23-2 yeah
# 2 2-3-231-2 no
# 3 122-121 yeah
Data
a <- structure(list(id = structure(c(1L, 3L, 2L), .Label = c("1-23-2",
"122-121", "2-3-231-2"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
b <- structure(list(id = structure(c(1L, 3L, 2L, 4L), .Label = c("1-23-2",
"12-1223-12", "122-121", "1221-12"), class = "factor")), class = "data.frame", row.names = c(NA,
-4L))

You may have several base R approaches to make it, e.g.,
a <- within(a,test <- ifelse(id %in% b$id,"yes","no"))
or
a <- within(a,test <- c("yes","no")[(!id%in% b$id) + 1])
or
a <- within(a,test <- c("yes","no")[is.na(match(id,b$id))+1])
such that
> a
id test
1 1-23-2 yes
2 2-3-231-2 no
3 122-121 yes
DATA
a <- data.frame(id = c("1-23-2", "2-3-231-2", "122-121"))
b <- data.frame(id = c("1-23-2", "122-121", "12-1223-12", "1221-12"))

Split Columns in a List of Dataframes R

I have a list of data frames in which some columns contain the special character -> (arrow). I want to loop through this list of data frames, locate the columns that contain -> (arrow), and split each of them into two new columns named with the suffixes _old and _new. This is a sample of the data frames:
dput(df1)
df1 <- structure(list(v1 = c("reg->joy", "ress", "mer->dls"),
t2 = c("James","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
dput(df2)
df2 <- structure(list(v1 = c("me", "df", "kl"),
t2 = c("James","Jane->dlt", "Egg"),
t3 = c("James ->may","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
dput(df3)
df3 <- structure(list(v1 = c("56->34", "df23-> ", "mkl"),
t2 = c("James","Jane", "Egg"),
d3 = c("James->","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
This is what I have tried
dfs <- list(df1,df2,df3)
for (y in 1:length(dfs)){
setDT(dfs[[y]])
df1<- lapply(names(dfs[[y]]), function(x) {
mDT <- df2[[y]][, tstrsplit(get(x), " *-> *")]
if (ncol(mDT) == 2L) setnames(mDT, paste0(x, c("_old", "_new")))
}) %>% as.data.table()
}
This only splits one data frame, I need to split all of the data frames.
NOTE: The code I have splits so well on one dataframe, what I want is how to implement it on a List of data frames
EXPECTED OUTPUT
dput(df1)
df1 <- structure(list(v1_old = c("reg", "mer"),
v1_new = c("joy", "dls")),
class = "data.frame", row.names = c(NA, -3L))
dput(df2)
df2 <- structure(list(t2_old = c("dlt"),
t2_new = c("dlt"),
t3_old = c("James"),
t3_new = c("may")),
class = "data.frame", row.names = c(NA, -3L))
dput(df3)
df3 <- structure(list(v1_old = c("56", "df23 "),
v1_new = c("34", " "),
d3 = c("James"),
d3 = c(" ")),
class = "data.frame", row.names = c(NA, -3L))

I add below a solution using the tidyverse.
Select the columns if one of the strings in the columns contains an arrow:
col_arrow_ls <- purrr::map(dfs, ~select_if(., ~any(str_detect(., "->"))))
Then split the columns using tidyr::separate. Since each element of the output is a data frame, purrr::map_dfc is used to column-bind them together:
# Split every column of `df1` on "->" into a `<col>_old` / `<col>_new` pair and
# column-bind the pieces into one data frame.
#
# df1: a data frame; every one of its columns is split, so pass only the
#      columns that should be separated.
# Returns the result of map_dfc(): two columns per input column.
split_df_fn <- function(df1) {
  names(df1) %>%
    map_dfc(~ df1 %>%
      # `.x` holds a column name as a string, so wrap it in all_of() rather
      # than passing an external character vector as a bare selection.
      select(dplyr::all_of(.x)) %>%
      tidyr::separate(
        dplyr::all_of(.x),
        into = paste0(.x, c("_old", "_new")),
        sep = "->",
        # Values without an arrow get NA in `_new` without raising a warning.
        fill = "right"
      ))
}
Apply the function to the list of data frames.
purrr::map(col_arrow_ls, split_df_fn)
[[1]]
v1_old v1_new
1 reg joy
2 ress <NA>
3 mer dls
[[2]]
t2_old t2_new t3_old t3_new
1 James <NA> James may
2 Jane dlt Jane <NA>
3 Egg <NA> Egg <NA>
[[3]]
v1_old v1_new d3_old d3_new
1 56 34 James
2 df23 Jane <NA>
3 mkl <NA> Egg <NA>

Filter rows based on one column from a list of dataframes

I have a list of multiple data frames and I would like to filter the data frames in the list by certain values in one column of each data frame. Each data frame in the list has a column called v1, which contains the special characters ++ and ->. I want to keep only the rows that have the arrow (->) in v1, for each data frame in the list. This is a sample of my data frames:
dput(df)
df1 <- structure(list(v1 = c("->", "++", "->"),
t2 = c("James","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
dput(df2)
df2 <- structure(list(v1 = c("++", "->", "->"),
t2 = c("James","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
dput(df3)
df3 <- structure(list(v1 = c("++", "++", "->"),
t2 = c("James","Jane", "Egg"),
d3...c = c("James","Jane", "Egg")),
class = "data.frame", row.names = c(NA, -3L))
I have tried this but I am not getting the dataframes of filtered rows
idx = "->"
dfs <- list(df1,df2,df3)
lapply(dfs, function(x) x$v1 %in% idx)
someone help

idx <- "->"
# Base R
lapply(dfs, function(df) df[df$v1 == "->",])
lapply(dfs, function(df) df[df$v1 %in% idx,])
# tidyverse
library("purrr")
library("dplyr")
map(dfs, filter, v1 == "->")
map(dfs, filter, v1 %in% !! idx)

Try this:
idx <- "->"
# Keep only the rows of `df` whose `v1` value is one of `values`.
#
# df:     a data frame with a column named `v1`.
# values: the `v1` values to keep. The default is evaluated lazily and resolves
#         to the `idx` object visible from where this function is defined, so
#         existing calls of the form `fnct(df)` behave as before.
# Returns the filtered rows as a data frame.
fnct <- function(df, values = idx) {
  # drop = FALSE stops a one-column data frame from collapsing to a vector.
  df[df$v1 %in% values, , drop = FALSE]
}
df1_idx <- fnct(df1)
df2_idx <- fnct(df2)
df3_idx <- fnct(df3)
dfs <- list(df1_idx, df2_idx, df3_idx)
dfs
Result:
[[1]]
v1 t2
1 -> James
3 -> Egg
[[2]]
v1 t2
2 -> Jane
3 -> Egg
[[3]]
v1 t2 d3...c
3 -> Egg Egg

Finding a Pattern in R

I am trying to clean some data. Below is an example of my data.
test1 test2 test3
jsb cjn kd N069W j N9DSW
I want to indicate what column has the pattern N0{num}{num}W in it. The {num} part can be any number between 0-9. This pattern can also appear anywhere in the string. Hence in this case my results would be as follows.
test1 test2 test3 col
jsb cjn kd N069W j N9DSW 2
Thanks in advance for any help.

We loop through the columns, use grepl to get a logical index and then with max.col get the column index of each row
max.col(data.frame(lapply(df1, grepl, pattern = "N0\\d{2}W")))
#[1] 2
data
df1 <- structure(list(test1 = "jsb cjn", test2 = "kd N069W j",
test3 = "N9DSW"), class = "data.frame", row.names = c(NA,
-1L))

You can also use the function str_detect() from the stringr package.
library(stringr)
# The pattern is the letter N, the digit zero, exactly two more digits, then W.
# The example string and the regex must both use the digit 0, not the letter O.
str_detect("kd N069W j", pattern = "N0\\d{2}W")
# [1] TRUE

Using apply:
df$col <- apply(df, 1, function(x) grep("N0\\d{2}W", x))
Data:
df <- structure(list(test1 = structure(1L, .Label = "jsb cjn", class = "factor"),
test2 = structure(1L, .Label = "kd N069W j", class = "factor"),
test3 = structure(1L, .Label = "N9DSW ", class = "factor")), class = "data.frame", row.names = c(NA,
-1L))

Equivalence between function(x) and purrr::map

I have this list:
list(structure(list(a = 1:10, b = 2:11, c = 3:12), .Names = c("a",
"b", "c"), row.names = c(NA, -10L), class = "data.frame"), structure(list(
a = 1:10, b = 2:11, c = 3:12), .Names = c("a", "b", "c"), row.names = c(NA,
-10L), class = "data.frame"), structure(list(a = 1:10, b = 2:11,
c = 3:12), .Names = c("a", "b", "c"), row.names = c(NA, -10L
), class = "data.frame"))
And this function:
# Summarise `x` as a named list: `s` is its sum and `m` is its mean, both
# computed with missing values removed.
fun1 <- function(x) {
  list(
    s = sum(x, na.rm = TRUE),
    m = mean(x, na.rm = TRUE)
  )
}
With lapply the result is ok. See:
list%>%
lapply(function(x){
lapply(x,fun1)
})
But, purrr::map doesn't work:
list%>%
map(.)%>%
map(.,fun1)
What's wrong?

Your syntax for the map part is wrong. You need the same code structure as you are using with lapply. First let's get rid of the pipes so the code looks more alike:
Also don't give objects the same name as R functions.
library(purrr)
lapply_outcome <- lapply(my_list, function(x) {lapply(x, fun1)})
map_outcome <- map(my_list, function(x) {map(x, fun1)})
identical(lapply_outcome, map_outcome)
[1] TRUE
With pipes:
my_list %>%
lapply(function(x) lapply(x,fun1))
my_list %>%
map(., function(x) map(x, fun1))
or with a formula call inside map, but personally I find this less readable:
my_list %>%
map(~ map(., fun1))