data frame de duplication - r

I have a data frame. Some rows differ only in the order of V1 and V2 (for example "A"-"B" versus "B"-"A"), and such rows have the same Value.
df <- tibble(
V1 = c("A", "C", "B","D"),
V2 = c("B", "D", "A","C"),
Value = c(1,2,1,2)
)
V1 V2 Value
<chr> <chr> <dbl>
1 A B 1
2 C D 2
3 B A 1
4 D C 2
I want to remove one row from each such duplicated pair (for example, either the first or the third row), so that the result looks like this:
V1 V2 Value
0 A B 1
1 C D 2
How can I remove those repetitive rows?

# Drop rows whose (V1, V2) pair has already been seen in either order.
# Only the two key columns are sorted: apply() over the whole data frame
# would coerce Value to character and sort it in with the keys, so rows such
# as ("1", "2", 3) and ("3", "1", 2) would wrongly compare as equal.
# na.last = TRUE keeps NA keys, so every row still yields exactly two values.
pair_key <- t(apply(as.matrix(df[c("V1", "V2")]), 1, sort, na.last = TRUE))
# Value stays part of the key: a row is dropped only if its Value matches too.
df[!duplicated(data.frame(pair_key, Value = df$Value)), ]
V1 V2 Value
0 A B 1
1 C D 2
or even:
# Order-insensitive key: the larger of V1/V2 first, the smaller second, so
# "A"-"B" and "B"-"A" map to the same pair; keep the first row of each pair.
df[with(df, !duplicated(cbind(pmax(V1, V2), pmin(V1, V2)))), ]
V1 V2 Value
0 A B 1
1 C D 2

An option with tidyverse
library(dplyr)
library(stringr)
library(purrr)
# Keep the first row for each unordered (V1, V2) pair.
# The sorted pair is joined with "\u001f" (the ASCII unit separator) rather
# than "": with an empty separator the keys for ("AB", "C") and ("A", "BC")
# would both be "ABC", and the second row would be dropped even though it is
# a different pair.
df %>%
  filter(!duplicated(pmap_chr(
    across(V1:V2),
    ~ str_c(sort(c(...)), collapse = "\u001f")
  )))
# A tibble: 2 × 3
V1 V2 Value
<chr> <chr> <dbl>
1 A B 1
2 C D 2

Related

Remove columns when they include specific string in R

I would like to remove those columns that include LA in it. A sample dataset looks like:
testdata <- data.frame(id = c(1,2,3),
v1 = c("LA", "C","D"),
v2 = c("N","M","LA"),
v3 = c("D","E","T"))
> testdata
id v1 v2 v3
1 1 LA N D
2 2 C M E
3 3 D LA T
How can I remove v1 and v2 and get the desired dataset below?
> testdata
id v3
1 1 D
2 2 E
3 3 T
# Find the positions of the columns that hold an exact "LA" value, then
# deselect those positions.
testdata %>%
  select(-which(sapply(., function(column) any(column == "LA"))))
id v3
1 1 D
2 2 E
3 3 T
Using sapply or vapply you could do:
testdata[vapply(testdata, function(x) !any(grepl("LA", x)), FUN.VALUE = logical(1))]
#> id v3
#> 1 1 D
#> 2 2 E
#> 3 3 T
testdata[sapply(testdata, function(x) !any(grepl("LA", x)))]
#> id v3
#> 1 1 D
#> 2 2 E
#> 3 3 T
Or using dplyr:
library(dplyr)
# Keep only the columns in which no value matches the pattern "LA".
testdata %>%
  select(where(function(column) !any(grepl("LA", column))))
#> id v3
#> 1 1 D
#> 2 2 E
#> 3 3 T
Using discard
library(purrr)
# Discard every column that contains the exact value "LA".
discard(testdata, function(column) "LA" %in% column)
id v3
1 1 D
2 2 E
3 3 T

Collapse several columns of data frame into one data frame

For some reason, my data has a few columns that are themselves one-column data frames. I want to "collapse" these data-frame columns into ordinary columns of a single data frame.
library(tidyverse)
df <- tibble(col1=1:5,
col2=tibble(newcol=LETTERS[1:5]),
col3=tibble(newcol2=LETTERS[6:10]))
df
# A tibble: 5 x 3
col1 col2$newcol col3$newcol2
<int> <chr> <chr>
1 1 A F
2 2 B G
3 3 C H
4 4 D I
5 5 E J
I have tried unnest(), but it replicates the tibbles in col2 and col3 for every row of col1, which is not what I want.
df2 <- df %>% unnest(cols = c(col2, col3))
df2
# A tibble: 25 x 3
col1 col2 col3
<int> <chr> <chr>
1 1 A F
2 1 B G
3 1 C H
4 1 D I
5 1 E J
6 2 A F
7 2 B G
8 2 C H
9 2 D I
10 2 E J
# ... with 15 more rows
The result that I want is as below:
df3 <- tibble(col1=1:5,
newcol=LETTERS[1:5],
newcol2=LETTERS[6:10])
df3
# A tibble: 5 x 3
col1 newcol newcol2
<int> <chr> <chr>
1 1 A F
2 2 B G
3 3 C H
4 4 D I
5 5 E J
Any idea how to do this? Any help is much appreciated.
It looks like you only want to change the column names, or am I missing something here?
# Spread the one-column data frames stored in col2 and col3 into ordinary
# top-level columns. unpack() keeps the inner column names (newcol, newcol2),
# which is what the desired output shows; assigning the inner vectors back
# with mutate() would leave them named col2 and col3.
df <- df %>%
  unpack(cols = c(col2, col3))
After your comment, here you can find a more general version (might not be suitable for all use cases)
# More general variant: unnest the columns, number the rows within each col1
# group, and keep the row whose within-group position equals col1.
# NOTE(review): this appears to rely on col1 being the 1..n row index and on
# unnest() repeating the nested rows for every col1 value (as in the
# question's 25-row output) -- confirm against your data and tidyr version.
# NOTE(review): the result is still grouped by col1; add ungroup() if the
# grouping is not wanted downstream.
df1<-df%>%unnest(cols = c(1:3))%>%
group_by(col1)%>%
# row = position of the row within its col1 group
mutate(row=row_number())%>%
filter(row==col1)%>%
select(-row)
If I understand correctly, you have three data frames, each containing one column, and you want to combine them into a single data frame. In that case cbind is an option.
# col2 and col3 are not standalone objects; they are data-frame columns of
# df, so they must be extracted with `$` before binding (a bare `col2` fails
# with "object not found"). df["col1"] keeps col1 as a one-column data frame
# so the nested columns are not bound a second time.
df3 <- cbind(df["col1"], df$col2, df$col3)
Output:
col1 newcol newcol2
1 1 A F
2 2 B G
3 3 C H
4 4 D I
5 5 E J

dplyr mutate: create column using first occurrence of another column

I was wondering if there's a more elegant way of taking a dataframe, grouping by x to see how many x's occur in the dataset, then mutating to find the first occurrence of every x (y)
test <- data.frame(x = c("a", "b", "c", "d",
"c", "b", "e", "f", "g"),
y = c(1,1,1,1,2,2,2,2,2))
x y
1 a 1
2 b 1
3 c 1
4 d 1
5 c 2
6 b 2
7 e 2
8 f 2
9 g 2
Current Output
output <- test %>%
group_by(x) %>%
summarise(count = n())
x count
<fct> <int>
1 a 1
2 b 2
3 c 2
4 d 1
5 e 1
6 f 1
7 g 1
Desired Output
x count first_seen
<fct> <int> <dbl>
1 a 1 1
2 b 2 1
3 c 2 1
4 d 1 1
5 e 1 2
6 f 1 2
7 g 1 2
I can filter the test dataframe for the first occurrences then use a left_join but was hoping there's a more elegant solution using mutate?
# filter for first occurrences of y
right <- test %>%
group_by(x) %>%
filter(y == min(y)) %>%
slice(1) %>%
ungroup()
# bind to the output dataframe
left_join(output, right, by = "x")
We can use first after grouping by 'x' to create a new column, use that also in group_by and get the count with n()
library(dplyr)
# Group by x, then add first_seen (the first y within each x) as a second
# grouping variable so it is carried through summarise().
# `.add = TRUE` appends to the existing grouping; the older `add` spelling
# is deprecated as of dplyr 1.0.0.
test %>%
  group_by(x) %>%
  group_by(first_seen = first(y), .add = TRUE) %>%
  summarise(count = n())
# A tibble: 7 x 3
# Groups: x [7]
# x first_seen count
# <fct> <dbl> <int>
#1 a 1 1
#2 b 1 2
#3 c 1 2
#4 d 1 1
#5 e 2 1
#6 f 2 1
#7 g 2 1
I have a question. Why not keep it simple? for example
test %>%
group_by(x) %>%
summarise(
count = n(),
first_seen = first(y)
)
#> # A tibble: 7 x 3
#> x count first_seen
#> <chr> <int> <dbl>
#> 1 a 1 1
#> 2 b 2 1
#> 3 c 2 1
#> 4 d 1 1
#> 5 e 1 2
#> 6 f 1 2
#> 7 g 1 2

how many distinct variables are represented in each group?

I have a data frame such as this:
df <- data.frame(
ID = c('123','124','125','126'),
Group = c('A', 'A', 'B', 'B'),
V1 = c(1,2,1,0),
V2 = c(0,0,1,0),
V3 = c(1,1,0,3))
which returns:
ID Group V1 V2 V3
1 123 A 1 0 1
2 124 A 2 0 1
3 125 B 1 1 0
4 126 B 0 0 3
and I would like to return a table that indicates if a variable is represented in the group or not:
Group V1 V2 V3
A 1 0 1
B 1 1 1
In order to count the number of distinct variables in each group.
Using:
# For each Group, flag (1/0) whether each of V1..V3 has at least one value
# greater than zero. across() replaces summarise_at() + funs(): funs() is
# deprecated and summarise_at() is superseded.
df %>%
  group_by(Group) %>%
  summarise(across(V1:V3, ~ as.integer(any(.x > 0))))
gives:
# A tibble: 2 × 4
Group V1 V2 V3
<fctr> <dbl> <dbl> <dbl>
1 A 1 0 1
2 B 1 1 1
Can be done in data.table:
# library() errors immediately if data.table is missing; require() would
# only return FALSE and let the following lines fail less clearly.
library(data.table)
# Convert df to a data.table by reference (no copy is made).
setDT(df)
# Per Group, TRUE when the column total is positive. The result columns are
# named explicitly instead of relying on data.table's automatic V1, V2, V3
# names, which only match the input names by coincidence.
table <- df[, .(V1 = sum(V1) > 0, V2 = sum(V2) > 0, V3 = sum(V3) > 0),
            by = Group]
table
Group V1 V2 V3
1: A TRUE FALSE TRUE
2: B TRUE TRUE TRUE
# Convert the logical flags in columns 2-4 to 0/1 integers, per Group.
# `.SDcols` and `by` are spelled out in full; the shorter `.SD = 2:4` only
# works through partial argument matching.
table[, lapply(.SD, as.integer), by = Group, .SDcols = 2:4]
Group V1 V2 V3
1: A 1 0 1
2: B 1 1 1

In R: sort data frame with non-unique values using custom vectors

If given this data frame:
df <-
data.frame(
v1=c("a","b","c","a"),
v2=c("z", "x", "x", "y"),
v3=c(1,2,3,4),
v4=factor(c("id1", "id2", "id3", "id4")))
> df
v1 v2 v3 v4
1 a z 1 id1
2 b x 2 id2
3 c x 3 id3
4 a y 4 id4
You can sort the data by v1 with
df[order(df$v1),]
v1 v2 v3 v4
1 a z 1 id1
4 a y 4 id4
2 b x 2 id2
3 c x 3 id3
And by a primary (v1) and secondary (v2) (to settle ties):
df[order(df$v1, df$v2),]
v1 v2 v3 v4
4 a y 4 id4
1 a z 1 id1
2 b x 2 id2
3 c x 3 id3
If you want to sort by a custom vector s (neither alphabetical nor numerical order), the only approach I know is match(), and this only works when the values are unique:
require(dplyr)
s <- c("b","c","a")
df %>%
distinct(v1) %>%
.[match(s,df$v1),]
v1 v2 v3 v4
2 b x 2 id2
3 c x 3 id3
1 a z 1 id1
So my question:
How can you 1) sort a data frame with not-unique values by a custom vector like s, and 2) how to include a secondary sort vector for instance to settle ties?
You can order the match results:
# Primary key: position of v1 within the custom vector s; ties are broken
# by v2.
df[with(df, order(match(v1, s), v2)), ]
# v1 v2 v3 v4
# 2 b x 2 id2
# 3 c x 3 id3
# 4 a y 4 id4
# 1 a z 1 id1

Resources