Identifying duplicates with different IDs - r

I have the following data:
set.seed(26312)
id <- rep(c(1, 2, 3, 4, 5), each = 9)
wrc <- round(runif(36, 20, 100))
wrc <- c(wrc, wrc[10:18])
x <- rep(1:9, 5)
dat <- data.frame(id, wrc, x)
In this data set, id 2 and id 5 contain the exact same data but with different IDs. This can be verified by running,
dat[dat$id == 2, ]
dat[dat$id == 5, ]
I have a much larger data set, with 4321 IDs, and I want to remove these duplicates because even though they have different IDs, they really are duplicates.
Presently I am using a combination of really awful and extremely slow for() and while() loops. In English, the code subsets one id and then compares that subset to every other id inside a while loop. When it finds a duplicate, meaning all rows of data are identical, it throws away the first id of the duplicated pair. The resulting cleaned_data is what I want; it is just unbearably slow to get there: a single pass takes roughly 1 minute, so with 4321 ids that works out to about 4321 minutes for the whole loop. Can someone help?
library("dplyr")
id_check = 1:5
cleaned_data <- data.frame()
for(i in id_check){
compare_tmp <- dat %>% filter(id == i)
compare_check <- compare_tmp %>% select(wrc, x)
duplicate = FALSE
if(i == length(id_check)){
cleaned_data <- rbind(cleaned_data, compare_tmp)
break
} else {
id_tmp = i + 1
}
while(duplicate == FALSE){
check <- dat %>% filter(id == id_tmp) %>% select(wrc, x)
if(nrow(check) == 0) break
duplicate = identical(compare_check, check)
id_tmp = id_tmp + 1
if(id_tmp == (length(id_check) + 1)) {
break
}
}
if(duplicate == FALSE){
cleaned_data <- rbind(cleaned_data, compare_tmp)
}
}
cleaned_data
This is in response to why duplicated won't work. Below, ids 2 and 5 are not the same subjects because their data aren't always identical.
set.seed(26312)
id <- rep(c(1, 2, 3, 4, 5), each = 9)
wrc <- round(runif(36, 20, 100))
wrc <- c(wrc, wrc[c(1, 11:18)])
x <- rep(1:9, 5)
dat <- data.frame(id, wrc, x)
dat[dat$id == 2,]
dat[dat$id == 5,]
If I run dat[!duplicated(dat[2:3]),] it removes id 5, when it shouldn't.

If the column structure is accurate, you could convert to wide format for duplicate detection:
dat_wide = reshape2::dcast(dat, id ~ x, value.var = "wrc")
# fromLast = TRUE marks the earlier copy as the duplicate, so the first id
# of each duplicated pair is the one dropped, as required
dupes = dat_wide$id[duplicated(dat_wide[-1], fromLast = TRUE)]
no_dupes = dat[!dat$id %in% dupes, ]
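For reference, the same wide reshape can be written with tidyr instead of reshape2 (a sketch, assuming tidyr >= 1.0; not part of the original answer):
library(tidyr)
dat_wide <- pivot_wider(dat, id_cols = id, names_from = x, values_from = wrc)
dupes <- dat_wide$id[duplicated(dat_wide[-1], fromLast = TRUE)]
no_dupes <- dat[!dat$id %in% dupes, ]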

Maybe something along the lines of:
do.call(
  rbind,
  split(dat, dat$id)[!duplicated(lapply(split(dat[2:3], dat$id), `rownames<-`, NULL),
                                 fromLast = TRUE)]
)
This splits by id, identifies duplicates (after resetting row names so identical blocks compare equal), then binds the non-duplicates back together.
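Unrolled into named steps, the one-liner reads:
by_id <- split(dat[2:3], dat$id)              # one (wrc, x) block per id
by_id <- lapply(by_id, `rownames<-`, NULL)    # reset row names so equal blocks compare identical
keep  <- !duplicated(by_id, fromLast = TRUE)  # flag every block except its last copy
cleaned_data <- do.call(rbind, split(dat, dat$id)[keep])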
Edit
Since time is of the essence here, I ran a benchmark of the solutions so far:
set.seed(26312)
p <- 4321
id <- rep(1:p, each = 9)
dats <- replicate(p %/% 2, round(runif(9, 20, 100)), simplify = FALSE)
wrc <- unlist(sample(dats, p, replace = TRUE))
x <- rep(1:9, times = p)
dat <- data.frame(id, wrc, x)

microbenchmark::microbenchmark(
  base = {
    do.call(
      rbind,
      split(dat, dat$id)[!duplicated(lapply(split(dat[2:3], dat$id), `rownames<-`, NULL),
                                     fromLast = TRUE)]
    )
  },
  tidyr = {
    as_tibble(dat) %>%
      nest(-id) %>%
      filter(!duplicated(data, fromLast = TRUE)) %>%
      unnest()
  },
  reshape = {
    dat_wide = reshape2::dcast(dat, id ~ x, value.var = "wrc")
    dupes = dat_wide$id[duplicated(dat_wide[-1], fromLast = TRUE)]
    no_dupes = dat[!dat$id %in% dupes, ]
  },
  times = 10L
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# base 892.8239 980.36553 1090.87505 1096.12514 1187.98810 1232.47244 10 c
# tidyr 944.8156 953.10558 977.71756 976.83703 990.58672 1033.27664 10 b
# reshape 49.9955 50.13347 52.20539 51.91833 53.91568 55.64506 10 a
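One further option, not benchmarked above: hash each id's block once, then deduplicate the hashes. This is a sketch assuming the digest package is available (it is not used in any of the answers here):
library(digest)
hashes <- vapply(split(dat[2:3], dat$id), function(d) {
  rownames(d) <- NULL  # row names differ across ids; drop them so equal blocks hash equally
  digest(d)
}, character(1))
dupes <- names(hashes)[duplicated(hashes, fromLast = TRUE)]
no_dupes <- dat[!dat$id %in% dupes, ]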

With tidyr:
library(tidyr)
library(dplyr)

as_tibble(dat) %>%
  nest(-id) %>%
  filter(!duplicated(data, fromLast = TRUE)) %>%
  unnest()
# # A tibble: 36 x 3
# id wrc x
# <dbl> <dbl> <int>
# 1 1 53 1
# 2 1 44 2
# 3 1 70 3
# 4 1 31 4
# 5 1 67 5
# 6 1 50 6
# 7 1 70 7
# 8 1 40 8
# 9 1 52 9
# 10 3 95 1
# # ... with 26 more rows
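Note that nest(-id) and the bare unnest() are pre-1.0 tidyr spellings; with tidyr >= 1.0 the equivalent would be:
as_tibble(dat) %>%
  nest(data = -id) %>%
  filter(!duplicated(data, fromLast = TRUE)) %>%
  unnest(data)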

Related

Grouping and Identifying the Maximum During a Simulation in R

I am trying to do some simulations in R and I am stuck on the loop I need. I am able to get what I need in one iteration, but coding the loop is throwing me off. This is what I am doing for one iteration:
Subjects <- c(1,2,3,4,5,6)
Group <- c('A','A','B','B','C','C')
Score <- rnorm(6,mean=5,sd=1)
Example <- data.frame(Subjects,Group,Score)
library(dplyr)
Score_by_Group <- Example %>% group_by(Group) %>% summarise(SumGroup = sum(Score))
Score_by_Group$Top_Group <- ifelse(Score_by_Group[,2] == max(Score_by_Group[,2]),1,0)
Group SumGroup Top_Group
1 A 8.77 0
2 B 6.22 0
3 C 9.38 1
What I need my loop to do is run the above X times and, every time a group has the top score, add 1 to that group's running total. So for example, if the loop ran x = 10 times, I would need a result like this:
Group Top_Group
1 A 3
2 B 5
3 C 2
If you don't mind forgoing the for loop, we can use replicate to repeat the code, then bind the output together, and then summarize.
library(tidyverse)

run_sim <- function() {
  Subjects <- c(1, 2, 3, 4, 5, 6)
  Group <- c('A', 'A', 'B', 'B', 'C', 'C')
  Score <- rnorm(6, mean = 5, sd = 1)
  Example <- data.frame(Subjects, Group, Score)
  Example %>%
    group_by(Group) %>%
    summarise(SumGroup = sum(Score)) %>%
    mutate(Top_Group = +(SumGroup == max(SumGroup))) %>%  # +() coerces TRUE/FALSE to 1/0
    select(-SumGroup)
}

results <- bind_rows(replicate(10, run_sim(), simplify = FALSE)) %>%
  group_by(Group) %>%
  summarise(Top_Group = sum(Top_Group))
Output
Group Top_Group
<chr> <int>
1 A 3
2 B 3
3 C 4
I think this should work:
library(dplyr)

Subjects <- c(1, 2, 3, 4, 5, 6)
Group <- c('A', 'A', 'B', 'B', 'C', 'C')
Groups <- c('A', 'B', 'C')
Top_Group <- c(0, 0, 0)
x <- 10

for (i in 1:x) {
  Score <- rnorm(6, mean = 5, sd = 1)
  Example <- data.frame(Subjects, Group, Score)
  Score_by_Group <- Example %>% group_by(Group) %>% summarise(SumGroup = sum(Score))
  Score_by_Group$Top_Group <- ifelse(Score_by_Group[, 2] == max(Score_by_Group[, 2]), 1, 0)
  # Accumulate a running win count per group
  Top_Group <- Top_Group + Score_by_Group$Top_Group
}
tibble(Groups, Top_Group)
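For comparison, a compact base R sketch of the same simulation, reusing Group and Groups from above (the seed is arbitrary, only for reproducibility):
set.seed(1)
x <- 10
wins <- replicate(x, {
  Score <- rnorm(6, mean = 5, sd = 1)
  names(which.max(tapply(Score, Group, sum)))  # letter of the top-scoring group
})
table(factor(wins, levels = Groups))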

Add the number in every row and take the sum

Having a dataframe like this:
data.frame(id = c(1, 2), num = c("30, 4, -2,", "10, 20"))
how is it possible to take the sum of every row of the column num, including the minus signs in the calculation?
Example of the expected output:
data.frame(id = c(1, 2), sum = c(32, 30))
Using Base R you could do the following:
# data
df <- data.frame(id = c(1,2), num = c("30, 4, -2,","10, 20"))
# split by ",", convert to numeric and then sum
df[, 2] <- sapply(strsplit(as.character(df$num), ","), function(x) {
  sum(as.numeric(x))
})
# result
df
# id num
# 1 1 32
# 2 2 30
If you can use packages, the tidy packages make this easy, using tidy data principles that become quick and natural once you get used to thinking this way.
library(tidyr)
library(dplyr)

df %>%
  # Convert the string of numbers to a tidy dataframe
  # with one number per row, keeping the id column for grouping
  separate_rows(num, sep = ",") %>%
  # Convert the text to a number so we can sum
  mutate(num = as.numeric(num)) %>%
  # Perform the calculation for each id
  group_by(id) %>%
  # Sum the numbers
  summarise(sum = sum(num, na.rm = TRUE)) %>%
  # Ungroup for further use of the data
  ungroup()
# A tibble: 2 x 2
# id sum
# <dbl> <dbl>
# 1 1 32
# 2 2 30
library(stringr)

df <- data.frame(id = c(1, 2), num = c("30, 4, -2", "10, 20"))
df$sum <- NA
for (i in 1:nrow(df)) {
  temp <- as.character(df[i, 2])
  n_num <- str_count(temp, '[0-9.]+')  # how many numbers the string contains
  total <- 0
  for (j in 1:n_num) {
    digit <- strsplit(temp, ',')[[1]][j]
    total <- total + as.numeric(digit)
    temp <- sub(digit, '', temp)  # blank out the processed number
  }
  df[i, 'sum'] <- total
}
print(df)
print(df)
id num sum
1 1 30, 4, -2 32
2 2 10, 20 30
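As yet another base R option (a sketch, not from the original answers), scan() parses each comma-separated string directly; an empty field from a trailing comma is read as NA, which na.rm = TRUE then drops:
df <- data.frame(id = c(1, 2), num = c("30, 4, -2,", "10, 20"))
df$sum <- sapply(as.character(df$num), function(s) {
  sum(scan(text = s, sep = ",", quiet = TRUE), na.rm = TRUE)
}, USE.NAMES = FALSE)
df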

Create new data frame with mean of multiple rows

I have a dataframe consisting of three columns: ID, Trial and a difference measure (diff_DT). I have 19 participants, who completed 30 trials each. This is what my dataframe looks like:
ID Trial diff_DT
01 005 37.5
01 006 40.5
01 007 16.5
... ... ...
02 005 16.5
... ... ...
02 016 27.9
Six of the 30 trials always form one block: block 1 = trials 5-10, block 2 = trials 16-21, block 3 = trials 26-31, block 4 = trials 36-41, block 5 = trials 46-51. (Note: trial numbers go above 30 because participants completed more trials in total.)
Now I need the mean of the variable diff_DT for each participant in each block, resulting in five means per participant, and I have no idea how to do it properly.
Thanks for your advice!
You can create a separate key data frame or matrix for the blocks/trials, merge that to your original table, and then run aggregate to get the mean score.
ID <- c(rep(1, 3), 2, 2)
Trial <- c(5, 6, 7, 5, 16)
diff_DT <- c(37.5, 40.5, 16.5, 16.5, 27.9)
Trial.key <- c(5:10, 16:21, 26:31, 36:41, 46:51)
block <- rep(1:5, each = 6)
df <- data.frame(ID, Trial, diff_DT)
blocks <- data.frame(Trial.key, block)
df.blocks <- merge(df, blocks, by.x = "Trial", by.y = "Trial.key",
                   all.x = TRUE, all.y = FALSE)
df.blocks
#   Trial ID diff_DT block
# 1     5  1    37.5     1
# 2     5  2    16.5     1
# 3     6  1    40.5     1
# 4     7  1    16.5     1
# 5    16  2    27.9     2
df.agg <- with(df.blocks, aggregate(diff_DT, by = list(ID, block), FUN = "mean"))
names(df.agg) <- c("ID", "block", "mean.diff_DT")
df.agg
#   ID block mean.diff_DT
# 1  1     1         31.5
# 2  2     1         16.5
# 3  2     2         27.9
If you want to only use base R, one way to do what you want is to create a column block in your dataframe and then apply the mean function for each participant in each block.
If Trial is numeric (which may not be the case given that your trials are 001, 002...), you can
df$block = ifelse(df$trial >= 5 & df$trial <= 10, 1,
            ifelse(df$trial >= 16 & df$trial <= 21, 2,
             ifelse(df$trial >= 26 & df$trial <= 31, 3,
              ifelse(df$trial >= 36 & df$trial <= 41, 4,
               ifelse(df$trial >= 46 & df$trial <= 51, 5, 0)))))
If Trial is not numeric (e.g. character or factor), you should first convert it to numeric with
df$trial = as.numeric(as.character(df$trial))
Then you just have to run
aggregate(df$diff_DT, by = list(df$block, df$id), mean)
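As an aside (not part of the original answer), findInterval() collapses the nested ifelse() into a single call, assuming only block trials (5-10, 16-21, 26-31, 36-41, 46-51) occur in the data, as the question describes:
# boundaries are the first trial of each block; trials below 5 would get block 0
df$block <- findInterval(df$trial, c(5, 16, 26, 36, 46))
Then aggregate as above.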
See if this helps you.
bd <- data.frame(ID = rep(1:6, each = 30),
Trial = c(sample(c(5:10,16:21,26:31,36:41,46:51), 30),
sample(c(5:10,16:21,26:31,36:41,46:51), 30),
sample(c(5:10,16:21,26:31,36:41,46:51), 30),
sample(c(5:10,16:21,26:31,36:41,46:51), 30),
sample(c(5:10,16:21,26:31,36:41,46:51), 30),
sample(c(5:10,16:21,26:31,36:41,46:51), 30)),
diff_DT = rnorm(n = 180, mean = 30, sd = 2))
library(dplyr)

bd <- bd %>%
  mutate(block = ifelse(Trial <= 10, 1,
                 ifelse(Trial <= 21, 2,
                 ifelse(Trial <= 31, 3,
                 ifelse(Trial <= 41, 4, 5)))))

bd %>%
  group_by(ID, block) %>%
  summarise(Mean = mean(diff_DT))
I wrote this dataframe as an example (you should provide code for generating your data to make answering easier and more precise):
ID <- rep(1:3, 47)
trial <- rep(5:51, 3)
diff_DT <- sample(1:10, 47*3, replace = T)
df <- data.frame(ID, trial, diff_DT)
Then I wrote a function to compute the blocks; they are assigned as described in the question, and if you need any clarification just ask:
computeBlocks <- function(df) {
  block <- rep(NA, nrow(df))
  for (i in 1:length(block)) {
    # blocks 2-5: trials 16-21, 26-31, 36-41, 46-51
    for (j in 1:4) {
      if (as.numeric(df$trial[i]) >= 6 + 10 * j && as.numeric(df$trial[i]) <= 11 + 10 * j) {
        block[i] <- j + 1
        break
      }
    }
    # block 1: trials 5-10
    if (as.numeric(df$trial[i]) >= 5 && as.numeric(df$trial[i]) <= 10) {
      block[i] <- 1
    }
  }
  df <- cbind(df, block)
  return(df)
}
And I computed the blocks:
df <- computeBlocks(df)
Finally using the package reshape2 I computed the mean per participant per block:
#install.packages("reshape2")
require(reshape2)
df_melt <- melt(df, id = c("ID", "block"))
means <- dcast(df_melt, ID + block ~ variable, mean)[,-3]
means
Your question wasn't entirely clear, so let me know if this needs to be improved.
I think this could be a simple way to do it:
library(dplyr)
# Create a table to map which Block each trial refers to
Trial <- c(5:10,16:21,26:31,36:41,46:51)
Block <- rep(1:5, each = 6)
map <- data_frame(Trial, Block)
# Take the original data frame and join the map to add the Block each trial
# belongs to; then group by participant ID and Block, and summarise by mean
df2 <- df %>%
  left_join(map, by = "Trial") %>%
  group_by(ID, Block) %>%
  summarise(mean = mean(diff_DT))
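For instance, with the small example data frame from the first answer above:
df <- data.frame(ID = c(1, 1, 1, 2, 2),
                 Trial = c(5, 6, 7, 5, 16),
                 diff_DT = c(37.5, 40.5, 16.5, 16.5, 27.9))
df2 then contains one row per participant per block (values computed from that data):
#   ID Block  mean
# 1  1     1  31.5
# 2  2     1  16.5
# 3  2     2  27.9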

Grouped operation on all groups relative to "baseline" group, with multiple observations

Starting with data containing multiple observations for each group, like this:
set.seed(1)
my.df <- data.frame(
  timepoint = rep(c(0, 1, 2), each = 3),
  counts = round(rnorm(9, 50, 10), 0)
)
> my.df
timepoint counts
1 0 44
2 0 52
3 0 42
4 1 66
5 1 53
6 1 42
7 2 55
8 2 57
9 2 56
To perform a summary calculation at each timepoint relative to timepoint == 0, I need to pass a vector of counts for timepoint == 0 and a vector of counts for the group in question (e.g. timepoint == 1) to an arbitrary function, e.g.
NonsenseFunction <- function(x, y) {
  (mean(x) - mean(y)) / (1 - mean(y))
}
I can get the required output from this table, either with dplyr:
library(dplyr)

my.df %>%
  group_by(timepoint) %>%
  mutate(rep = paste0("r", 1:n())) %>%
  left_join(x = ., y = filter(., timepoint == 0), by = "rep") %>%
  group_by(timepoint.x) %>%
  summarise(result = NonsenseFunction(counts.x, counts.y))
or data.table:
library(data.table)
my.dt <- data.table(my.df)
my.dt[, rep := paste0("r", 1:length(counts)), by = timepoint]
merge(my.dt, my.dt[timepoint == 0], by = "rep", all = TRUE)[
, NonsenseFunction(counts.x, counts.y), by = timepoint.x]
This only works if the number of observations is the same in every group, and since the observations aren't actually matched, the temporary rep variable feels hacky anyway.
For a more general case, where I need to pass vectors of the baseline values and the group's values to an arbitrary (more complicated) function, is there an idiomatic data.table or dplyr way of doing this as a grouped operation over all groups?
Here's the straightforward data.table approach, writing f for your arbitrary function (NonsenseFunction above):
my.dt[, f(counts, my.dt[timepoint==0, counts]), by=timepoint]
This probably grabs my.dt[timepoint==0, counts] again and again, for each group. You could instead save that value ahead of time:
v = my.dt[timepoint==0, counts]
my.dt[, f(counts, v), by=timepoint]
... or if you don't want to add v to the environment, maybe
with(list(v = my.dt[timepoint==0, counts]),
my.dt[, f(counts, v), by=timepoint]
)
You could pass the vector of counts from your baseline group as a constant second argument:
my.df %>%
  group_by(timepoint) %>%
  mutate(response = NonsenseFunction(counts, my.df$counts[my.df$timepoint == 0]))
Or if you want to make it beforehand:
constant <- my.df$counts[my.df$timepoint == 0]
my.df %>%
  group_by(timepoint) %>%
  mutate(response = NonsenseFunction(counts, constant))
You can try:
library(dplyr)

my.df %>%
  mutate(new = mean(counts[timepoint == 0])) %>%
  group_by(timepoint) %>%
  summarise(result = NonsenseFunction(counts, new))
# A tibble: 3 × 2
# timepoint result
# <dbl> <dbl>
#1 0 0.0000000
#2 1 0.1398601
#3 2 0.2097902

Identify values that appear a certain number of times in an R data frame

I have a data frame of strings, most of which are duplicated. I would like to identify values in this data frame that occur at least x number of times.
df <- data.frame(x = c("str", "str", "str", "ing", "ing","."))
occurs <- 3
The data frame contains hundreds of unique strings and tens of thousands of elements altogether. In this example, how can I identify which strings occur at least three times? Specifically, I want to output the names of the qualifying strings, not their indices in the data frame.
Maybe table is what you need - here's a modified example based on your code:
> df <- data.frame(x = c("str", "str", "str", "ing", "ing","."))
> df
x
1 str
2 str
3 str
4 ing
5 ing
6 .
> table(df$x)
. ing str
1 2 3
> table(df$x) > 2
. ing str
FALSE FALSE TRUE
> names(which(table(df$x) > 2))
[1] "str"
You could also use count:
library(dplyr)
df %>% count(x)
This will call n() to count the number of observations for each x:
# Source: local data frame [3 x 2]
#
# x n
# 1 . 1
# 2 ing 2
# 3 str 3
If you only want those occurring at least 3 times, use filter():
df %>% count(x) %>% filter(n >= 3)
Which gives:
# Source: local data frame [1 x 2]
#
# x n
# 1 str 3
Finally, if you only want to extract the factors that correspond to your filter criteria:
df %>% count(x) %>% filter(n >= 3) %>% .$x
# [1] str
# Levels: . ing str
As suggested by @David in the comments, you could also use data.table:
library(data.table)
setDT(df)[, if(.N >= 3) x, by = x]$V1
Or
setDT(df)[, .N, by = x][, x[N >= 3]]
# [1] str
# Levels: . ing str
As suggested by @Frank, you could also use table's "workhorse", tabulate (note that this relies on df$x being a factor, which is the data.frame default in R versions before 4.0):
levels(df[[1]])[tabulate(df[[1]]) >= 3]
# [1] "str"
Benchmark
df <- data.frame(x = sample(LETTERS[1:26], 10e6, replace = TRUE))
df2 <- copy(df)
library(microbenchmark)
mbm <- microbenchmark(
  base = names(which(table(df$x) >= 385000)),
  base2 = levels(df[[1]])[tabulate(df[[1]]) > 385000L],
  dplyr = count(df, x) %>% filter(n >= 385000) %>% .$x,
  DT1 = setDT(df2)[, if (.N >= 385000) x, by = x]$V1,
  DT2 = setDT(df2)[, .N, by = x][, x[N >= 385000]],
  times = 50
)
> mbm
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# base 495.44936 523.29186 545.08199 543.56660 551.90360 652.13492 50 d
# base2 20.08123 20.09819 20.11988 20.10633 20.14137 20.20876 50 a
# dplyr 226.75800 227.27992 231.19709 228.36296 232.71308 259.20770 50 c
# DT1 41.03576 41.28474 50.92456 48.40740 48.66626 168.53733 50 b
# DT2 41.45874 41.85510 50.76797 48.93944 49.49339 74.58234 50 b
