How to remove all the variables whose names end in .x or .y? - r

I have a list of data frames (lst1). Each data frame in lst1 has some variables that look like test.x, test.y, try.x, try.y, etc.
I want to remove those variables, which were created by merging datasets without first dropping the duplicated variables (try, test, etc.). How should I remove them now?
Thanks.

You can also try this:
#Data
List <- list(A=data.frame(a=1,b=5,test.x=NA,test.y=5),
B=data.frame(a=5,b=6,test.x=NA,try.x=7))
#Remove
# Drop merge-suffix columns (names ending in ".x" or ".y") from a data frame.
#
# x: a data frame.
# Returns x without the columns whose names end in ".x" or ".y"; the result
# is always a data frame, even when zero or one column remains.
myfun <- function(x) {
  # Escape the dot and anchor at the end of the name: the unescaped,
  # unanchored pattern '.x|.y' also matches names such as "max" or "type".
  is_suffixed <- grepl("\\.[xy]$", names(x))
  # A logical mask (rather than negative indices) keeps every column when
  # nothing matches; drop = FALSE prevents collapsing to a vector when only
  # one column is left.
  x[, !is_suffixed, drop = FALSE]
}
#Apply
List2 <- lapply(List,myfun)
Output:
List2
$A
a b
1 1 5
$B
a b
1 5 6

Here's a tidyverse approach:
We can use the dplyr::select function to select only the columns we want. matches() allows us to select columns using regular expressions. \\.[xy]$ matches columns that contain a period followed by x or y and $ anchors the match to the end of the string.
The purrr::map function allows us to apply the selection to each list element. ~ defines a formula which is automatically converted to a function.
library(tidyverse)
lst2 <- lst1 %>%
map(~dplyr::select(.,-matches("\\.[xy]$")))
map(lst2, head, 2)
#[[1]]
# ID name
#1 1 A
#2 2 B
#[[2]]
# ID name
#1 1 A
#2 2 B
#[[3]]
# ID name
#1 1 A
#2 2 B
#[[4]]
# ID name
#1 1 A
#2 2 B
#[[5]]
# ID name
#1 1 A
#2 2 B
Sample Data:
lst1 <- replicate(5,data.frame(ID = 1:15, name = LETTERS[1:15], test.x = runif(15), test.y = runif(15)),simplify = FALSE)
map(lst1, head, 2)
#[[1]]
# ID name test.x test.y
#1 1 A 0.03772391 0.2630905
#2 2 B 0.11844048 0.2929392
#[[2]]
# ID name test.x test.y
#1 1 A 0.398029 0.5151159
#2 2 B 0.348489 0.9534869
#[[3]]
# ID name test.x test.y
#1 1 A 0.7447383 0.6862136
#2 2 B 0.3623562 0.7542699
#
#[[4]]
# ID name test.x test.y
#1 1 A 0.9341495 0.8660333
#2 2 B 0.8383039 0.6299427
#[[5]]
# ID name test.x test.y
#1 1 A 0.02662444 0.04502225
#2 2 B 0.29855214 0.46189116

In base R, we can use endsWith
lapply(List, function(x) x[!(endsWith(names(x),
'.x')|endsWith(names(x), '.y'))])
-output
#$A
# a b
#1 1 5
#$B
# a b
#1 5 6
data
List <- list(A = structure(list(a = 1, b = 5, test.x = NA, test.y = 5), class = "data.frame", row.names = c(NA,
-1L)), B = structure(list(a = 5, b = 6, test.x = NA, try.x = 7), class = "data.frame", row.names = c(NA,
-1L)))

Related

Change the column names of a list of dataframes in R [duplicate]

This question already has answers here:
Changing Column Names in a List of Data Frames in R
(6 answers)
Rename Columns of dataframe based on names of list in R
(2 answers)
Closed 2 years ago.
I have a list of dataframes in this form.
d1 <- data.frame(i = c("a","b","c"), var = 1:3, stringsAsFactors=FALSE)
d2 <- data.frame(i = c("b","c","d"), var = 5:8, stringsAsFactors=FALSE)
d3 <- data.frame(i = c("c","d","a"), var = 2:4, stringsAsFactors=FALSE)
dfList <- list(d1,d2,d3)
I want to change the var variables to var_d1, var_d2, var_d3 respectively to do a full join later. How do I implement this? How do I retrieve the names of the data frames and turn them into strings?
Start with naming the list
names(dfList) <- paste0('d', seq_along(dfList))
Once you do that you can use Map to rename columns :
Map(function(x, y) {names(x)[-1] <- paste(names(x)[-1], y, sep = "_");x},
dfList, names(dfList))
#$d1
# i var_d1
#1 a 1
#2 b 2
#3 c 3
#$d2
# i var_d2
#1 b 5
#2 c 6
#3 d 7
#$d3
# i var_d3
#1 c 2
#2 d 3
#3 a 4
Or in tidyverse :
library(dplyr)
library(purrr)
imap(dfList, function(x, y) x %>% rename_with(~paste(., y, sep = "_"), -1))
dfList <- mget(paste0("d", 1:3))
mapply(function(df, name) {
names(df)[names(df) == "var"] <- paste0("var_", name)
df
}, dfList, names(dfList), SIMPLIFY = FALSE)
#> $d1
#> i var_d1
#> 1 a 1
#> 2 b 2
#> 3 c 3
#>
#> $d2
#> i var_d2
#> 1 b 5
#> 2 c 6
#> 3 d 7
#>
#> $d3
#> i var_d3
#> 1 c 2
#> 2 d 3
#> 3 a 4
To change the variable names and then save them in a list of strings, you can do something like this.
(I think you made a mistake in d2 so I changed it)
# Example data: three two-column data frames (key column "i", value "var").
d1 <- data.frame(i = c("a", "b", "c"), var = 1:3, stringsAsFactors = FALSE)
d2 <- data.frame(i = c("b", "c", "d"), var = 5:7, stringsAsFactors = FALSE)
d3 <- data.frame(i = c("c", "d", "a"), var = 2:4, stringsAsFactors = FALSE)
dfList <- list(d1, d2, d3)
# Preallocate the result instead of growing a list inside the loop.
column_names <- vector("list", length(dfList))
# seq_along() iterates zero times for an empty list, whereas 1:length()
# would yield c(1, 0) and fail.
for (i in seq_along(dfList)) {
  # Rename the two columns of the i-th data frame to "i" and "var_d<i>".
  colnames(dfList[[i]]) <- c("i", paste0("var_d", i))
  # Record the new column names for this data frame.
  column_names[[i]] <- names(dfList[[i]])
}
# they are stored here
column_names
[[1]]
[1] "i" "var_d1"
[[2]]
[1] "i" "var_d2"
[[3]]
[1] "i" "var_d3"
Maybe we can try the code below
> Map(function(k) setNames(dfList[[k]],c("i",paste0("var_d",k))),seq_along(dfList))
[[1]]
i var_d1
1 a 1
2 b 2
3 c 3
[[2]]
i var_d2
1 b 6
2 c 7
3 d 8
[[3]]
i var_d3
1 c 2
2 d 3
3 a 4
An approach quite similar to the ones proposed using Map, that uses lapply instead:
# Rename the columns of every data frame in dfList to "i" and "var_d<k>",
# where k is the position of the data frame in the list.
dfList <- lapply(
  # seq_along() is safe for an empty list; 1:length() would yield c(1, 0).
  seq_along(dfList),
  function(x) {
    setNames(dfList[[x]], c("i", paste0("var_d", x)))
  }
)

create dummy variable based on values of another variable?

I have a large dataset with multiple columns of the following structure
A B
1. 1. D1
2. 1. D2
3. 2 D2
4. 3. D1
5. 3. D2
I'm trying to create a new data frame based on unique observations in column A, with a dummy variable "Dummy" coded as 1=D1, 2=D2, 3=both, like so:
A. Dummy
1. 1. 3
2. 2. 2
3. 3. 3
Any idea how I can go about this?
You can use aggregate.
aggregate(B ~ A, df, function(x) if(all(x == "D1")) 1 else if(all(x == "D2")) 2 else 3)
# A B
# 1 1 3
# 2 2 2
# 3 3 3
Another possible solution:
df %>%
group_by(A) %>%
summarise(B = paste0(B, collapse = "_")) %>%
mutate(Dummy = case_when(
B == "D1" ~ 1,
B == "D2" ~ 2,
B == "D1_D2" | B == "D2_D1" ~ 3,
TRUE ~ NA_real_
)) %>%
select(-B)
Result
# A tibble: 3 x 2
A Dummy
<dbl> <dbl>
1 1 3
2 2 2
3 3 3
Here is an option with dplyr. After grouping by 'A', if the number of distinct elements is greater than 1, return 3; otherwise use a named vector to match the first element of 'B'.
library(dplyr)
df1 %>%
group_by(A) %>%
summarise(Dummy = if(n_distinct(B) > 1) 3L else
setNames(1:2, c("D1", "D2"))[first(B)])
# A tibble: 3 x 2
# A Dummy
#* <dbl> <int>
#1 1 3
#2 2 2
#3 3 3
data
df1 <- structure(list(A = c(1, 1, 2, 3, 3), B = c("D1", "D2", "D2",
"D1", "D2")), class = "data.frame", row.names = c("1.", "2.",
"3.", "4.", "5."))

Including map() function to tabulate each element in a character vector returns an error

I'd like to tabulate the frequencies of each single character in a character vector. This vector contains the answers to a set of items in a survey, with the structure "ADCDAB...", where "A" is the answer to the first item, "D" to the second one, etc.
I'd like to process the data with purrr::map combined with base string functions.
# Split each answer string into its single characters, then tabulate the
# characters of each answer. substr() requires start/stop arguments and is
# not needed here because the whole string is split.
p1 <- strsplit(test$answer, "")
map(p1, table)
However, if I put the same code in a dplyr pipeline:
test %>%
mutate(p1=strsplit(answer,"")) %>%
map(p1,table)
the system returns the following error message:
Error: Index 1 must have length 1, not 10
What's wrong with the second syntax?
A dummy dataset
structure(list(answer = c(".BBCBD.A.D", "...DB..AA.", "B......AB.",
"BDDDBACADD", "BB.ABC.AAD"), d.n.i = c(1, 2, 3, 4, 5)), row.names = c(NA,
5L), class = "data.frame")
Here is a base R option
x <- "ADCDAB"
out <- table(utf8ToInt(x))
names(out) <- intToUtf8(names(out), multiple = TRUE)
out
#A B C D
#2 1 1 2
With multiple elements use lapply
x <- c("ADCDAB", "EFG")
# Tabulate the characters of a single string.
#
# i: a character string.
# Returns a table of counts named by the characters themselves, ordered by
# code point.
f <- function(i) {
  # Count occurrences of each code point.
  code_counts <- table(utf8ToInt(i))
  # The table is named by code point; convert those back to characters.
  char_labels <- intToUtf8(names(code_counts), multiple = TRUE)
  setNames(code_counts, char_labels)
}
lapply(x, f)
Returns
#[[1]]
#A B C D
#2 1 1 2
#[[2]]
#E F G
#1 1 1
If you need output as single table, try
x <- c("ADCDAB", "EFGAA")
f(paste(x, collapse = ""))
#A B C D E F G
#4 1 1 2 1 1 1
.. or as dataframe
as.data.frame(f(paste(x, collapse = "")))
# Var1 Freq
#1 A 4
#2 B 1
#3 C 1
#4 D 2
#5 E 1
#6 F 1
#7 G 1
You could do :
library(tidyverse)
test %>% mutate(p1 = strsplit(answer,""), p2 = map(p1, table))
However, I would suggest something like below :
test %>%
mutate(p1 = strsplit(answer,"")) %>%
unnest(p1) %>%
count(answer, p1)
# answer p1 n
# <chr> <chr> <int>
#1 ABCD A 1
#2 ABCD B 1
#3 ABCD C 1
#4 ABCD D 1
#5 ADCDAB A 2
#6 ADCDAB B 1
#7 ADCDAB C 1
#8 ADCDAB D 2
data
test <- data.frame(answer = c("ADCDAB", "ABCD"), stringsAsFactors = FALSE)

Dynamically select all columns but among ones that start with a certain word exclude all but keep one

I have many data frames that come in such a format:
df1 <- structure(list(ID = 1:2, Name = 1:2, Gender = 1:2, Group = 1:2,
FORMULA_RULE = 1:2, FORMULA_TRANSFORM = 1:2, FORMULA_UNITE = 1:2,
FORMULA_CALCULATE = 1:2, FORMULA_JOIN = 1:2), class = "data.frame", row.names = c(NA,
-2L))
df2 <- structure(list(ID = 1:2, Name = 1:2, Gender = 1:2, FORMULA_RULE = 1:2,
FORMULA_META = c(NA, NA), FORMULA_DATA = 1:2, FORMULA_JOIN = 1:2,
FORMULA_TRANSFORM = 1:2, Group = 1:2), class = "data.frame", row.names = c(NA,
-2L))
View:
df1
ID Name Gender Group FORMULA_RULE FORMULA_TRANSFORM FORMULA_UNITE FORMULA_CALCULATE FORMULA_JOIN
1 1 1 1 1 1 1 1 1 1
2 2 2 2 2 2 2 2 2 2
df2
ID Name Gender FORMULA_RULE FORMULA_META FORMULA_DATA FORMULA_JOIN FORMULA_TRANSFORM Group
1 1 1 1 1 NA 1 1 1 1
2 2 2 2 2 NA 2 2 2 2
I want to write code that works on all such data frames so that all columns are kept, except that among the columns that start with FORMULA_, only FORMULA_TRANSFORM is selected. Please note that the columns that do NOT start with FORMULA_ are not always the same; that is to say, I cannot simply write code that always selects ID, Name, Gender, Group, and FORMULA_TRANSFORM, because some data frames contain many other columns that do not start with FORMULA_ which I want to keep.
My attempt to solve this problem is this ugly code which works as expected:
library(tidyverse)
for(i in 1:length(ls(pattern = "df"))){
get(paste0("df", i)) %>%
select(-starts_with("FORMULA"),
(names(get(paste0("df", i))) %>% grep(pattern = "FORMULA", value = T))[!names(get(paste0("df", i))) %>% grep(pattern = "FORMULA", value = T) %in% "FORMULA_TRANSFORM"])
%>% print
}
Is there a more straight-forward way to do this?
With dplyr we can use select and it's pretty straight forward using starts_with and contains.
library(dplyr)
df1 %>%
select(-starts_with("FORMULA_"), contains("FORMULA_TRANSFORM"))
# ID Name Gender Group FORMULA_TRANSFORM
#1 1 1 1 1 1
#2 2 2 2 2 2
Let's try with a dataframe without "FORMULA_TRANSFORM" column
df3 <- df1
df3$FORMULA_TRANSFORM <- NULL
df3 %>%
select(-starts_with("FORMULA_"), contains("FORMULA_TRANSFORM"))
# ID Name Gender Group
#1 1 1 1 1
#2 2 2 2 2
With minus sign we are removing the columns that starts_with "FORMULA_" and selecting the one with "FORMULA_TRANSFORM". Instead of contains we can also use one_of() or matches() and it would still work.
Using base R we can use grep with invert and value set as TRUE
df1[c(grep("^FORMULA_", names(df1), invert = TRUE, value = TRUE),
"FORMULA_TRANSFORM")]
# ID Name Gender Group FORMULA_TRANSFORM
#1 1 1 1 1 1
#2 2 2 2 2 2
This creates a vector of column names where column name doesn't start with "FORMULA_" and we add "FORMULA_TRANSFORM" manually later.
The above method assumes that you always have a "FORMULA_TRANSFORM" column in your dataframe, and it will fail if there isn't one. A safer option would be:
# Keep every column that does not start with "FORMULA_", plus the
# "FORMULA_TRANSFORM" column when it exists (appended after the others).
#
# df1: a data frame.
# Returns a data frame with the selected columns.
get_selected_cols <- function(df1) {
  col_names <- names(df1)
  # Columns whose name does not begin with the FORMULA_ prefix.
  non_formula <- df1[!grepl("^FORMULA_", col_names)]
  # Zero-column data frame when FORMULA_TRANSFORM is absent, so cbind()
  # still succeeds.
  transform_col <- df1[col_names == "FORMULA_TRANSFORM"]
  cbind(non_formula, transform_col)
}
get_selected_cols(df1)
# ID Name Gender Group FORMULA_TRANSFORM
#1 1 1 1 1 1
#2 2 2 2 2 2
get_selected_cols(df3)
# ID Name Gender Group
#1 1 1 1 1
#2 2 2 2 2

Working with dataframes in a list: Drop variables, add new ones

Define a list dats with two dataframes, df1 and df2
dats <- list( df1 = data.frame(a=sample(1:3), b = sample(11:13)),
df2 = data.frame(a=sample(1:3), b = sample(11:13)))
> dats
$df1
a b
1 2 12
2 3 11
3 1 13
$df2
a b
1 3 13
2 2 11
3 1 12
I would like to drop variable a in each data frame. Next I would like to add a variable with the id of each dataframe from an external dataframe, like:
ids <- data.frame(id=c("id1","id2"),df=c("df1","df2"))
> ids
id df
1 id1 df1
2 id2 df2
To drop unnecessary vars I tried this without luck:
> dats <- lapply(dats, function(x) assign(x, x[,c("b")]))
> Error in assign(x, x[, c("b")]) : invalid first argument
Not sure how to add the id either.
I also tried, perhaps more appropriately:
> temp <- lapply(dats, function(x) subset(x[1], select=x[[1]]$b))
Error in x[[1]]$b : $ operator is invalid for atomic vectors
What I find confusing is that str(out[1]) returns a list, str(out[[1]]) returns a dataframe. I think that may have something to do with it.
Or try this: Extract your ids into a named vector that maps the data-frame name to the id:
df2id <- ids$id
names(df2id) <- ids$df
> df2id
df1 df2
id1 id2
Levels: id1 id2
Then use mapply to both (a) drop the a column from each data-frame, and (b) add the id column:
> mapply( function(d,x) cbind( subset(d, select = -a),
+ id = x),
+ dats, df2id[ names(dats) ] ,
+ SIMPLIFY=FALSE)
$df1
b id
1 12 id1
2 11 id1
3 13 id1
$df2
b id
1 12 id2
2 11 id2
3 13 id2
Note that we are passing df2id[ names(dats) ] to mapply -- this ensures that the ids in df2id are "aligned" with the data frames in dats.
Is this OK?
# Example data: a named list of data frames and a lookup table mapping each
# data-frame name (df) to its id.
dats <- list(
  df1 = data.frame(a = sample(1:3), b = sample(11:13)),
  df2 = data.frame(a = sample(1:3), b = sample(11:13))
)
ids <- data.frame(id = c("id1", "id2"), df = c("df1", "df2"))
# remove variable a; drop = FALSE keeps the result a data frame with its
# column name "b" (without it the single remaining column collapses to a
# bare vector, which is why merge() used to produce columns named x and y)
dats2 <- lapply(dats, function(x) x[, names(x) != "a", drop = FALSE])
# add id: look up the id for each data frame by its list name and attach it
# as a column called "id"; seq_along() is safe for an empty list
for (i in seq_along(dats2)) {
  dats2[[i]]$id <- ids$id[match(names(dats2)[i], ids$df)]
}
dats2
$df1
x y
1 11 id1
2 12 id1
3 13 id1
$df2
x y
1 11 id2
2 12 id2
3 13 id2

Resources