Here is an example of a data frame:
A B C
1 1 1
1 1 4
1 2 4
2 1 1
3 1 1
3 1 2
I would like to extract only the rows that are unique in A and B.
I can't use unique(), duplicated(), etc. because they always retain one of my duplicated rows.
The final result I wish to obtain is:
A B C
1 2 4
2 1 1
How can I do it?
Thank you
Here are a couple of options -
Base R -
cols <- c('A', 'B')
res <- df[!(duplicated(df[cols]) | duplicated(df[cols], fromLast = TRUE)), ]
res
# A B C
#3 1 2 4
#4 2 1 1
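If you prefer to avoid the two duplicated() calls, a base R sketch using per-group row counts via ave() (with the same cols) should give the same result:
# keep only rows whose (A, B) combination occurs exactly once
res2 <- df[ave(seq_len(nrow(df)), df[cols], FUN = length) == 1, ]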
dplyr -
library(dplyr)
df %>% group_by(A, B) %>% filter(n() == 1) %>% ungroup
# A tibble: 2 x 3
# A B C
# <int> <int> <int>
#1 1 2 4
#2 2 1 1
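With dplyr >= 1.0 you can also avoid the explicit group_by()/ungroup() pair by using add_count(); a small sketch:
df %>% add_count(A, B) %>% filter(n == 1) %>% select(-n)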
data.table -
df <- data.frame(
A = c(1L, 1L, 1L, 2L, 3L, 3L),
B = c(1L, 1L, 2L, 1L, 1L, 1L),
C = c(1L, 4L, 4L, 1L, 1L, 2L)
)
library(data.table)
setDT(df)[, .SD[.N == 1], by = list(A, B)]
#> A B C
#> 1: 1 2 4
#> 2: 2 1 1
Created on 2022-02-28 by the reprex package (v2.0.1)
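An equivalent data.table form filters whole groups rather than subsetting .SD; with df already converted by setDT() above, a sketch:
df[, if (.N == 1L) .SD, by = .(A, B)]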
Related
I have a group nested within another group in my data. I would like to randomise the order of the nested groups while preserving the order of the rows within each nested group. (This will be a step within an existing pipe, so a tidyverse solution would be ideal.)
In the example below, how do I randomise the order of block within participant_id, while also preserving the order of both participant_id and trial?
library(dplyr)
set.seed(123)
# dummy data
data <- tibble::tribble(
~participant_id, ~block, ~trial,
1L, "a", 1L,
1L, "a", 2L,
1L, "a", 3L,
1L, "b", 1L,
1L, "b", 2L,
1L, "b", 3L,
2L, "a", 1L,
2L, "a", 2L,
2L, "a", 3L,
2L, "b", 1L,
2L, "b", 2L,
2L, "b", 3L
)
# something along the lines of...
new_data <- data %>%
group_by(participant_id) %>%
# ? step here to randomise order within 'block', while preserving order within 'trial'.
Thanks.
Here's one approach:
# Randomise block order within one participant
# (.x is the group's data without participant_id; .y is the one-row group key)
randomiseGroup <- function(.x, .y) {
  # Generalise so that any number of blocks can be handled
  r <- .x %>%
    distinct(block) %>%
    mutate(random = runif(nrow(.)))
  # Order blocks by their random draw, keeping trial order within each block
  .y %>%
    bind_cols(
      .x %>%
        ungroup() %>%
        left_join(r, by = "block") %>%
        arrange(random, trial) %>%
        select(-random)
    )
}
# Randomise all participants
data %>%
group_by(participant_id) %>%
group_map(randomiseGroup) %>%
bind_rows()
# A tibble: 12 × 3
participant_id block trial
<int> <chr> <int>
1 1 a 1
2 1 a 2
3 1 a 3
4 1 b 1
5 1 b 2
6 1 b 3
7 2 b 1
8 2 b 2
9 2 b 3
10 2 a 1
11 2 a 2
12 2 a 3
One option could be:
data %>%
group_by(participant_id) %>%
mutate(rleid = cumsum(block != lag(block, default = first(block))),
block_random = sample(n())) %>%
group_by(participant_id, rleid) %>%
mutate(block_random = min(block_random)) %>%
ungroup()
participant_id block trial rleid block_random
<int> <chr> <int> <int> <int>
1 1 a 1 0 2
2 1 a 2 0 2
3 1 a 3 0 2
4 1 b 1 1 1
5 1 b 2 1 1
6 1 b 3 1 1
7 2 a 1 0 2
8 2 a 2 0 2
9 2 a 3 0 2
10 2 b 1 1 1
11 2 b 2 1 1
12 2 b 3 1 1
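The rleid/block_random columns only assign a random rank to each block run; to actually reorder the rows, you could finish with an arrange()/select() step (this final step is my addition, not part of the original answer):
new_data <- data %>%
  group_by(participant_id) %>%
  mutate(rleid = cumsum(block != lag(block, default = first(block))),
         block_random = sample(n())) %>%
  group_by(participant_id, rleid) %>%
  mutate(block_random = min(block_random)) %>%
  ungroup() %>%
  arrange(participant_id, block_random, trial) %>%
  select(-rleid, -block_random)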
I have a dataframe such as:
COL1 VALUE1 VALUE2
1 A,A 1 5
2 A,A,B 1 3
3 C 1 1
4 D 1 2
5 D 1 2
6 A,A 1 10
7 A,B,A 1 2
I can collapse the duplicated letters within COL1 and count how many rows fall into each collapsed group by using:
as.data.frame(table(tab$COL1)) %>%
group_by(Var1 = sapply(strsplit(as.character(Var1), ","), function(x) toString(unique(x)))) %>%
summarise(Freq = sum(Freq))
And then I get:
# A tibble: 4 × 2
Var1 Freq
<chr> <int>
1 A 2
2 A, B 2
3 C 1
4 D 2
But I wondered if someone had an idea for adding a new column called Mean which would be, for each COL1 group, the mean of the VALUE2 values, so that I get:
Var1 Freq Mean
1 A 2 7.5 < because (5+10)/2 =7.5
2 A, B 2 2.5 < because (3+2)/2 =2.5
3 C 1 1 < because 1/1 = 1
4 D 2 2 < because (2+2)/2 = 2
Here is the dataframe in case it helps:
structure(list(COL1 = structure(c(1L, 2L, 4L, 5L, 5L, 1L, 3L), .Label = c("A,A",
"A,A,B", "A,B,A", "C", "D"), class = "factor"), VALUE1 = c(1L,
1L, 1L, 1L, 1L, 1L, 1L), VALUE2 = c(5L, 3L, 1L, 2L, 2L, 10L,
2L)), class = "data.frame", row.names = c(NA, -7L))
You can calculate the frequency table directly in the dplyr chain, and then just add a Mean = mean(VALUE2) in the summarise() call.
I.e.
tab %>%
group_by(Var1 = sapply(strsplit(as.character(COL1), ","), function(x) toString(unique(x)))) %>%
summarise(Freq = sum(VALUE1), Mean = mean(VALUE2))
# # A tibble: 4 x 3
# Var1 Freq Mean
# <chr> <int> <dbl>
# 1 A 2 7.5
# 2 A, B 2 2.5
# 3 C 1 1
# 4 D 2 2
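Note that Freq = sum(VALUE1) only counts rows because VALUE1 happens to be 1 everywhere; if that is not guaranteed, n() counts the rows directly (a small variation on the chain above):
tab %>%
  group_by(Var1 = sapply(strsplit(as.character(COL1), ","), function(x) toString(unique(x)))) %>%
  summarise(Freq = n(), Mean = mean(VALUE2))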
Is this what you want:
library(dplyr)
tab %>%
mutate(COL1 = sapply(strsplit(as.character(COL1), ","), function(x) toString(unique(x)))) %>%
group_by(COL1) %>%
summarise(Freq = sum(VALUE1),
Mean = mean(VALUE2))
# A tibble: 4 x 3
COL1 Freq Mean
* <chr> <int> <dbl>
1 A 2 7.5
2 A, B 2 2.5
3 C 1 1
4 D 2 2
Suppose I want to find duplicated rows for these columns:
cols<-c("col1", "col2")
I know that for data df4 the duplicated rows are:
Jo<-df4[duplicated(df4[cols]) | duplicated(df4[cols], fromLast = TRUE), ]
and removing these duplicated rows from the data set is given by:
No<-df4[!(duplicated(df4[cols]) | duplicated(df4[cols], fromLast = TRUE)), ]
I want to modify the above code. Suppose there is a column called mode that takes integer values between 1 and 4. Rows should not be treated as duplicates when all of the duplicated rows share mode == 2.
Example:
col1 col2 mode
1 3 5
5 3 9
1 2 1
1 2 1
3 2 2
3 2 2
4 1 3
4 1 2
4 1 2
output
Jo:
col1 col2 mode
1 2 1
1 2 1
4 1 3
4 1 2
4 1 2
No:
col1 col2 mode
1 3 5
5 3 9
3 2 2
3 2 2
In the example above, the two rows with col1 == 3 and col2 == 2 are not treated as duplicates because mode == 2 for both of them, but the last three rows (col1 == 4, col2 == 1) are treated as duplicates because one of them has a mode other than 2.
Based on the updated dataset,
library(dplyr)
out1 <- df2 %>%
group_by_at(vars(cols)) %>%
filter(n() > 1, !all(mode ==2))
out2 <- anti_join(df2, out1)
out1
# A tibble: 5 x 3
# Groups: col1, col2 [2]
# col1 col2 mode
# <int> <int> <int>
#1 1 2 1
#2 1 2 1
#3 4 1 3
#4 4 1 2
#5 4 1 2
out2
# col1 col2 mode
#1 1 3 5
#2 5 3 9
#3 3 2 2
#4 3 2 2
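group_by_at() is superseded in current dplyr; an equivalent sketch with across()/all_of() (dplyr >= 1.0):
out1 <- df2 %>%
  group_by(across(all_of(cols))) %>%
  filter(n() > 1, !all(mode == 2))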
Or with data.table
library(data.table)
i1 <- setDT(df2)[ , .I[.N > 1 & !all(mode == 2)], by = cols]$V1
df2[i1]
# col1 col2 mode
#1: 1 2 1
#2: 1 2 1
#3: 4 1 3
#4: 4 1 2
#5: 4 1 2
df2[!i1]
# col1 col2 mode
#1: 1 3 5
#2: 5 3 9
#3: 3 2 2
#4: 3 2 2
Or using base R (note this assumes df2 is still a plain data.frame, i.e. run it before the setDT() call above)
i1 <- duplicated(df2[1:2]) | duplicated(df2[1:2], fromLast = TRUE)
out11 <- df2[i1 & with(df2, !ave(mode == 2, col1, col2, FUN = all)), ]
out22 <- df2[setdiff(row.names(df2), row.names(out11)), ]
data
df2 <- structure(list(col1 = c(1L, 5L, 1L, 1L, 3L, 3L, 4L, 4L, 4L),
col2 = c(3L, 3L, 2L, 2L, 2L, 2L, 1L, 1L, 1L), mode = c(5L,
9L, 1L, 1L, 2L, 2L, 3L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-9L))
ID Number Var
1 2 6
1 2 7
1 1 8
1 2 9
1 2 10
2 2 3
2 2 4
2 1 5
2 2 6
Each person has several records.
For each person there is exactly one record whose Number is 1; the rest are 2.
The variable Var takes different values for the same person.
When Number equals 1, the corresponding Var (call it P) differs from person to person.
Now, for every person, I want to delete the rows where Var > P.
In the end, I want this:
ID Number Var
1 2 6
1 2 7
1 1 8
2 2 3
2 2 4
2 1 5
You can use dplyr::first() on Var where Number == 1 to get the per-ID cutoff value.
library(dplyr)
df %>% group_by(ID) %>% mutate(Flag = first(Var[Number == 1])) %>%
  filter(Var <= Flag) %>% select(-Flag)
# short version, if you are sure there is exactly one Number == 1 per ID
df %>% group_by(ID) %>% filter(Var <= Var[Number == 1])
Here is a solution with data.table:
library(data.table)
dt <- fread(
"ID Number Var
1 2 6
1 2 7
1 1 8
1 2 9
1 2 10
2 2 3
2 2 4
2 1 5
2 2 6")
dt[, .SD[Var <= Var[Number==1]], ID]
# ID Number Var
# 1: 1 2 6
# 2: 1 2 7
# 3: 1 1 8
# 4: 2 2 3
# 5: 2 2 4
# 6: 2 1 5
A base R option would be
df1[with(df1, Var <= ave(Var * (Number == 1), ID, FUN = function(x) x[x!=0])),]
# ID Number Var
#1 1 2 6
#2 1 2 7
#3 1 1 8
#6 2 2 3
#7 2 2 4
#8 2 1 5
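To unpack the one-liner: Var * (Number == 1) zeroes out every Var except the row where Number == 1, and ave(..., ID, FUN = function(x) x[x != 0]) broadcasts that per-person P back to every row, so the condition reduces to Var <= P. A more explicit two-step sketch of the same idea (assuming exactly one Number == 1 per ID and positive Var, as in this data):
P <- with(df1, ave(Var * (Number == 1), ID, FUN = max))  # per-row copy of each person's P
df1[df1$Var <= P, ]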
data
df1 <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), Number = c(2L,
2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L), Var = c(6L, 7L, 8L, 9L, 10L,
3L, 4L, 5L, 6L)), row.names = c(NA, -9L), class = "data.frame")
I'm looking for a solution to add the column "desired_result", preferably using dplyr and/or ave(). See the data frame below, where the group is "section" and the unique instances I want my "desired_result" column to count sequentially are in "exhibit":
structure(list(section = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), exhibit = structure(c(1L,
2L, 3L, 3L, 1L, 2L, 2L, 3L), .Label = c("a", "b", "c"), class = "factor"),
desired_result = c(1L, 2L, 3L, 3L, 1L, 2L, 2L, 3L)), .Names = c("section",
"exhibit", "desired_result"), class = "data.frame", row.names = c(NA,
-8L))
dense_rank it is
library(dplyr)
df %>%
group_by(section) %>%
mutate(desire=dense_rank(exhibit))
# section exhibit desired_result desire
#1 1 a 1 1
#2 1 b 2 2
#3 1 c 3 3
#4 1 c 3 3
#5 2 a 1 1
#6 2 b 2 2
#7 2 b 2 2
#8 2 c 3 3
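Since the question also mentions ave(), a base R sketch of the same per-section dense rank (assuming exhibit is a factor or character vector):
df$desired <- with(df, ave(as.integer(factor(exhibit)), section,
                           FUN = function(x) match(x, sort(unique(x)))))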
I've recently pushed a function rleid() to data.table (at the time of writing available in the development version, 1.9.5; it has since been included in the CRAN releases), which does exactly this. If you're interested, you can install the development version to try it.
require(data.table) # 1.9.5, for `rleid()`
require(dplyr)
DF %>%
group_by(section) %>%
mutate(desired_results=rleid(exhibit))
# section exhibit desired_result desired_results
# 1 1 a 1 1
# 2 1 b 2 2
# 3 1 c 3 3
# 4 1 c 3 3
# 5 2 a 1 1
# 6 2 b 2 2
# 7 2 b 2 2
# 8 2 c 3 3
If exact enumeration is necessary and you need the desired result to be consistent (so that the same exhibit in a different section always gets the same number), you can try:
library(dplyr)
df <- data.frame(section = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L),
exhibit = c('a', 'b', 'c', 'c', 'a', 'b', 'b', 'c'))
if (is.null(saveLevels <- levels(df$exhibit)))
saveLevels <- sort(unique(df$exhibit)) ## or levels(factor(df$exhibit))
df %>%
group_by(section) %>%
mutate(answer = as.integer(factor(exhibit, levels = saveLevels)))
## Source: local data frame [8 x 3]
## Groups: section
## section exhibit answer
## 1 1 a 1
## 2 1 b 2
## 3 1 c 3
## 4 1 c 3
## 5 2 a 1
## 6 2 b 2
## 7 2 b 2
## 8 2 c 3
If/when a new exhibit appears in subsequent sections, they should get newly enumerated results. (Notice the last exhibit is different.)
df2 <- data.frame(section = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L),
exhibit = c('a', 'b', 'c', 'c', 'a', 'b', 'b', 'd'))
if (is.null(saveLevels2 <- levels(df2$exhibit)))
saveLevels2 <- sort(unique(df2$exhibit))
df2 %>%
group_by(section) %>%
mutate(answer = as.integer(factor(exhibit, levels = saveLevels2)))
## Source: local data frame [8 x 3]
## Groups: section
## section exhibit answer
## 1 1 a 1
## 2 1 b 2
## 3 1 c 3
## 4 1 c 3
## 5 2 a 1
## 6 2 b 2
## 7 2 b 2
## 8 2 d 4