Remove columns when they include specific string in R - r

I would like to remove every column that contains the value LA. A sample dataset looks like:
# Sample data: an id column plus three character columns; v1 and v2 each
# contain the value "LA".
testdata <- data.frame(
  id = c(1, 2, 3),
  v1 = c("LA", "C", "D"),
  v2 = c("N", "M", "LA"),
  v3 = c("D", "E", "T")
)
> testdata
id v1 v2 v3
1 1 LA N D
2 2 C M E
3 3 D LA T
How can I remove v1 and v2 and get the desired dataset below?
> testdata
id v3
1 1 D
2 2 E
3 3 T

# Drop every column in which at least one value equals "LA".
# vapply() always returns a logical vector, so which() receives a valid
# argument even for a data frame with no columns (sapply() would return an
# empty list there).
testdata %>%
  select(-which(vapply(
    .,
    function(x) any(x == "LA"),
    FUN.VALUE = logical(1)
  )))
id v3
1 1 D
2 2 E
3 3 T

Using sapply or vapply you could do:
# Build a per-column mask that is TRUE when no element matches "LA",
# then keep only those columns.
keep <- vapply(
  testdata,
  function(col) !any(grepl("LA", col)),
  FUN.VALUE = logical(1)
)
testdata[keep]
#> id v3
#> 1 1 D
#> 2 2 E
#> 3 3 T
# Same filter with sapply(): flag the columns free of "LA", then subset by
# that mask.
no_la <- sapply(testdata, function(col) !any(grepl("LA", col)))
testdata[no_la]
#> id v3
#> 1 1 D
#> 2 2 E
#> 3 3 T
Or using dplyr:
library(dplyr)
# where() keeps the columns for which the predicate returns TRUE.
select(testdata, where(function(col) !any(grepl("LA", col))))
#> id v3
#> 1 1 D
#> 2 2 E
#> 3 3 T

Using discard
library(purrr)
# discard() drops the columns for which the predicate is TRUE.
discard(testdata, function(col) is.element("LA", col))
id v3
1 1 D
2 2 E
3 3 T

Related

Doing data mapping for each value in a df

I have a data set like this
# df1 holds numeric codes; df2 maps each code (var_id) to a label (var_value).
df1 <- data.frame(
  ID = c(1, 2, 3, 4),
  colA = c(101, 102, 103, 104),
  colB = c(201, 202, 203, 204)
)
df2 <- data.frame(
  var_id = c(101, 102, 103, 104, 201, 202, 203, 204),
  var_value = c("A", "B", "C", "D", "E", "F", "G", "H")
)
I want to map any value in df1 that is in df2$var_id with the corresponding string in df2$var_value.
Desired output
# Desired result: colA and colB hold the var_value labels instead of codes.
df1<-data.frame(ID=c(1,2,3,4),colA=c("A","B","C","D"),colB=c("E","F","G","H"))
I have tried writing a function and applying it with lapply, but it only displays one var_value.
# Translate a vector of codes into labels through the df2 lookup table.
lookup <- function(codes) df2$var_value[match(codes, df2$var_id)]
cols <- c("colA", "colB")
df1[cols] <- lapply(df1[cols], lookup)
df1
# ID colA colB
# 1 1 A E
# 2 2 B F
# 3 3 C G
# 4 4 D H
You can join twice.
library(dplyr)
# Join the lookup table once per code column; the two var_value columns get
# the default .x / .y suffixes and are renamed in the final select().
joined_a <- left_join(df1, df2, by = c("colA" = "var_id"))
joined_ab <- left_join(joined_a, df2, by = c("colB" = "var_id"))
select(joined_ab, ID, colA = var_value.x, colB = var_value.y)
# ID colA colB
# 1 1 A E
# 2 2 B F
# 3 3 C G
# 4 4 D H
with the tidyverse you can apply a function across several columns:
library(tidyverse)
# For each code in colA/colB, pick the var_value whose var_id equals it.
df1 |>
  mutate(across(
    colA:colB,
    function(codes) {
      map_chr(codes, function(code) with(df2, var_value[code == var_id]))
    }
  ))
#> ID colA colB
#> 1 1 A E
#> 2 2 B F
#> 3 3 C G
#> 4 4 D H
#or
# Vectorised variant: match() finds each code's position in var_id.
df1 |>
  mutate(across(
    colA:colB,
    function(codes) with(df2, var_value[match(codes, var_id)])
  ))
#> ID colA colB
#> 1 1 A E
#> 2 2 B F
#> 3 3 C G
#> 4 4 D H

data frame de duplication

I have a data frame in which some rows differ only in the order of V1 and V2 ("A"-"B" versus "B"-"A"), and such rows have the same Value.
# Row-wise layout of the same data: rows 1/3 and 2/4 are the same unordered
# pair with the same Value.
df <- tribble(
  ~V1, ~V2, ~Value,
  "A", "B", 1,
  "C", "D", 2,
  "B", "A", 1,
  "D", "C", 2
)
V1 V2 Value
<chr> <chr> <dbl>
1 A B 1
2 C D 2
3 B A 1
4 D C 2
I want to keep only one row from each such duplicated pair (for example, drop either the first or the third row), so that the result looks like this:
V1 V2 Value
0 A B 1
1 C D 2
How can I remove those repetitive rows?
# Sort only the V1/V2 pair within each row so that "A"-"B" and "B"-"A" get the
# same key. Value is compared as its own column: sorting the whole row would
# coerce Value to character and mix it in with the key cells.
pair_key <- t(apply(as.matrix(df[c("V1", "V2")]), 1, sort))
df[!duplicated(data.frame(pair_key, Value = df$Value)), ]
V1 V2 Value
0 A B 1
1 C D 2
or even:
# Order each pair as (larger, smaller) element-wise, then drop repeated pairs.
larger <- pmax(df$V1, df$V2)
smaller <- pmin(df$V1, df$V2)
df[!duplicated(cbind(larger, smaller)), ]
V1 V2 Value
0 A B 1
1 C D 2
An option with tidyverse
library(dplyr)
library(stringr)
library(purrr)
# Build one key per row from the sorted V1/V2 values and keep the first row
# for each key. The pieces are joined with a separator (the ASCII unit
# separator) because collapse = "" makes keys ambiguous: "AB" + "C" and
# "A" + "BC" would both become "ABC".
df %>%
  filter(!duplicated(pmap_chr(
    across(V1:V2),
    ~ str_c(sort(c(...)), collapse = "\u001f")
  )))
# A tibble: 2 × 3
V1 V2 Value
<chr> <chr> <dbl>
1 A B 1
2 C D 2

Replacement of column values based on a named vector

Consider the following named vector vec and tibble df:
# Lookup vector: the names are the codes, the values are the replacements.
vec <- setNames(c("a", "b", "c"), c("1", "2", "3"))
# 11 rows: four 1s, two 2s, five 3s.
df <- tibble(col = rep(1:3, times = c(4, 2, 5)))
df
# # A tibble: 11 x 1
# col
# <int>
# 1 1
# 2 1
# 3 1
# 4 1
# 5 2
# 6 2
# 7 3
# 8 3
# 9 3
# 10 3
# 11 3
I would like to replace the values in the col column with the corresponding named values in vec.
I'm looking for a tidyverse approach, that doesn't involve converting vec as a tibble.
I tried the following, without success:
# Attempt reported by the asker as not working.
df %>%
mutate(col = map(
vec,
~ str_replace(col, names(.x), .x)
))
Expected output:
# A tibble: 11 x 1
col
<chr>
1 a
2 a
3 a
4 a
5 b
6 b
7 c
8 c
9 c
10 c
11 c
You could use col :
# Index vec by name: the integer codes are converted to the matching names.
codes <- as.character(df[["col"]])
df[["col1"]] <- vec[codes]
Or in mutate :
library(dplyr)
# Same name-based lookup inside mutate().
mutate(df, col1 = vec[as.character(col)])
# col col1
# <int> <chr>
# 1 1 a
# 2 1 a
# 3 1 a
# 4 1 a
# 5 2 b
# 6 2 b
# 7 3 c
# 8 3 c
# 9 3 c
#10 3 c
#11 3 c
We can also use data.table
library(data.table)
# Convert df with setDT(), then add col1 with := using the same lookup.
dt <- setDT(df)
dt[, col1 := vec[as.character(col)]]

how to split a string column in R based on equal length and get them in different rows

library(tidyr)
library(dplyr)
mydf
V1 V2
2 1 abcdef
3 2 abcd
4 3 bghj
5 4 kl
6 5 uilm
I want to split the V2 column into pieces of length 2, with each piece on its own row:
V1 V2
1 1 ab
2 1 cd
3 1 ef
4 2 ab
5 2 cd
6 3 bg
7 3 hj
8 4 kl
9 5 ui
10 5 lm
Here is a base R option splitting the string every 2 characters -
# Base R: split V2 into two-character pieces and emit one row per piece.
# The last value is "uilm", matching the data shown in the question.
mydf <- data.frame(
  V1 = 1:5,
  V2 = c("abcdef", "abcd", "bghj", "kl", "uilm"),
  stringsAsFactors = FALSE
)
# The lookbehind makes strsplit() cut after every two characters.
tmp <- strsplit(mydf$V2, "(?<=..)", perl = TRUE)
# Repeat each source row once per piece; seq_len() avoids the 1:0 trap.
result <- mydf[rep(seq_len(nrow(mydf)), lengths(tmp)), ]
result$V2 <- unlist(tmp)
rownames(result) <- NULL
result
#   V1 V2
#1   1 ab
#2   1 cd
#3   1 ef
#4   2 ab
#5   2 cd
#6   3 bg
#7   3 hj
#8   4 kl
#9   5 ui
#10  5 lm
Another tidyverse approach: insert a marker character such as # (one that does not occur elsewhere in the data) between the pieces, and then split on it with tidyr::separate_rows.
library(tidyverse)
# Re-join the two-character pieces with "#", then split on that marker.
# sep is given explicitly: the default pattern of separate_rows() splits on
# any run of non-alphanumeric characters, not only on the inserted "#".
# NOTE(review): `df` is assumed to hold the V1/V2 data from the question.
df %>%
  mutate(V2 = map_chr(
    strsplit(V2, "(?<=..)", perl = TRUE),
    ~ paste(.x, collapse = "#")
  )) %>%
  separate_rows(V2, sep = "#")
#> # A tibble: 10 x 2
#> V1 V2
#> <int> <chr>
#> 1 1 ab
#> 2 1 cd
#> 3 1 ef
#> 4 2 ab
#> 5 2 cd
#> 6 3 bg
#> 7 3 hj
#> 8 4 kl
#> 9 5 ul
#> 10 5 im
Created on 2021-06-04 by the reprex package (v2.0.0)
You can also use the following solution:
library(dplyr)
library(tidyr)
library(stringr)
# Cut V2 into pieces that start at every odd position and span two
# characters. Start positions run up to nchar(V2), so an odd-length string
# keeps its final character and a one-character string no longer makes seq()
# fail; max(..., 1) keeps an empty string as a single "" piece.
df %>%
  rowwise() %>%
  mutate(V2 = list({
    starts <- seq(1, max(nchar(V2), 1), by = 2)
    str_sub(V2, starts, starts + 1)
  })) %>%
  ungroup() %>%
  unnest_longer(col = V2)
# A tibble: 10 x 2
V1 V2
<int> <chr>
1 1 ab
2 1 cd
3 1 ef
4 2 ab
5 2 cd
6 3 bg
7 3 hj
8 4 kl
9 5 ui
10 5 lm
You can define a function to sub-string every other character and apply it row wise on V2 to create a nested column of character vector. Then, unnest the column.
library(tidyverse)
# Inline copy of the question's data, parsed with a header row.
raw_text <- "
V1 V2
1 abcdef
2 abcd
3 bghj
4 kl
5 uilm"
mydf <- read.table(text = raw_text, header = TRUE)
#' Split one string into consecutive two-character pieces.
#'
#' @param str A single character string.
#' @return A character vector of pieces. The last piece has one character
#'   when `str` has odd length. Returns `character(0)` for the empty string
#'   and `NA_character_` for a missing value.
get_string <- function(str) {
  if (is.na(str)) {
    return(NA_character_)
  }
  n_chars <- nchar(str)
  # seq(1, 0, 2) raises "wrong sign in 'by' argument", so the empty string is
  # handled before the start positions are built.
  if (n_chars == 0L) {
    return(character(0))
  }
  starts <- seq(1L, n_chars, by = 2L)
  # substring() is vectorised over the start/stop positions.
  substring(str, starts, starts + 1L)
}
# Apply get_string() to each row's V2, store the pieces in a list column,
# then expand that column to one row per piece.
by_row <- rowwise(mydf)
nested <- ungroup(mutate(by_row, V2 = list(get_string(V2))))
unnest(nested, V2)
# # A tibble: 10 x 2
# V1 V2
# <int> <chr>
# 1 1 ab
# 2 1 cd
# 3 1 ef
# 4 2 ab
# 5 2 cd
# 6 3 bg
# 7 3 hj
# 8 4 kl
# 9 5 ui
# 10 5 lm

Add together 2 dataframes in R without losing columns

I have 2 dataframes in R (df1, df2), with df1 as
A C D
1 1 1
2 2 2
df2 as
A B C
1 1 1
2 2 2
How can I merge these 2 dataframes to produce the following output?
A B C D
2 1 2 1
4 2 4 2
Columns are sorted and column values are added. Both DFs have same number of rows. Thank you in advance.
Code to create DF:
df1 <- data.frame("A" = 1:2, "C" = 1:2, "D" = 1:2)
df2 <- data.frame("A" = 1:2, "B" = 1:2, "C" = 1:2)
# Add the columns the two frames share and carry the remaining columns over
# unchanged.
nm1 <- names(df1)
nm2 <- names(df2)
nm <- intersect(nm1, nm2)
combined <- if (length(nm) == 0) {
  # No column names in common: nothing to add, just bind side by side.
  cbind(df1, df2)
} else {
  cbind(
    df1[!nm1 %in% nm2], # columns only in df1
    df1[nm] + df2[nm],  # add columns common to both
    df2[!nm2 %in% nm1]  # columns only in df2
  )
}
# The question asks for sorted columns, so reorder them by name.
combined <- combined[order(names(combined))]
combined
#  A B C D
#1 2 1 2 1
#2 4 2 4 2
You can try:
library(tidyverse)
# Stack both frames with their row names as a column, then sum every other
# column within each original row; columns missing from one frame are NA
# after bind_rows() and are skipped by na.rm = TRUE.
list(df2, df1) %>%
  map(rownames_to_column) %>%
  bind_rows() %>%
  group_by(rowname) %>%
  summarise(across(everything(), ~ sum(.x, na.rm = TRUE)))
# A tibble: 2 x 5
rowname A B C D
<chr> <int> <int> <int> <int>
1 1 2 1 2 1
2 2 4 2 4 2
By using left_join() from dplyr you won't lose the column
library(tidyverse)
# Two tibbles sharing only column c; the join keeps all columns of both.
dat1 <- tibble(a = 1:10, b = 1:10, c = 1:10)
dat2 <- tibble(c = 1:10, d = 1:10, e = 1:10)
dat1 %>% left_join(dat2, by = "c")
#> # A tibble: 10 x 5
#> a b c d e
#> <int> <int> <int> <int> <int>
#> 1 1 1 1 1 1
#> 2 2 2 2 2 2
#> 3 3 3 3 3 3
#> 4 4 4 4 4 4
#> 5 5 5 5 5 5
#> 6 6 6 6 6 6
#> 7 7 7 7 7 7
#> 8 8 8 8 8 8
#> 9 9 9 9 9 9
#> 10 10 10 10 10 10
Created on 2019-01-16 by the reprex package (v0.2.1)
# Start from an all-zero frame holding every column name in sorted order,
# then add each input frame into it.
allnames <- sort(union(names(df1), names(df2)))
df3 <- data.frame(matrix(0, nrow = nrow(df1), ncol = length(allnames)))
names(df3) <- allnames
# Select the target columns by name so that each input column is added to the
# column with the same name, whatever order the input columns are in.
df3[names(df1)] <- df3[names(df1)] + df1
df3[names(df2)] <- df3[names(df2)] + df2
df3
A B C D
1 2 1 2 1
2 4 2 4 2
Here is a fun base R method with Reduce.
# Names present in both frames; these columns are summed.
shared <- intersect(names(df1), names(df2))
Reduce(cbind, list(
  Reduce("+", list(df1[shared], df2[shared])), # sum results
  df1[setdiff(names(df1), names(df2))],        # in df1, not df2
  df2[setdiff(names(df2), names(df1))]         # in df2, not df1
))
This returns
A C D B
1 2 2 1 1
2 4 4 2 2
This assumes that both df1 and df2 have columns that are not present in the other. If this is not true, you'd have to adjust the list.
Note also that you could replace Reduce with do.call in both places and you'd get the same result.

Resources