ddply using "group_by" logic - r

I'm trying to use ddply to find the smallest distance between two positions pos where the corresponding chrom is the same in two dataframes:
head(bps, 10)
chrom pos iteration
1 1 4 1
2 1 14 1
3 1 68 1
4 1 79 1
5 1 200 1
6 1 205 1
7 1 270 1
8 1 304 1
9 2 7 1
10 2 13 1
head(flocs)
chrom pos
1 1 100
2 1 200
3 1 220
4 1 312
5 2 500
6 2 501
As an example, for the first line in bps, I want to find the closest pos in flocs where chrom = 1, which gives a value of -96.
The pseudocode for what I'm trying to do is:
foreach iteration (bps$iteration):
foreach chrom (bps$chrom):
foreach pos (bps$pos):
features_pos = pos in dataframe flocs closest to pos on the same chromosome
min_dist = feature_pos - pos
return features_pos, min_dist
I am trying to do this with ddply:
minDists <- ddply(bp_data, c("chrom", "pos"), function(x) {
index <- which.min(abs(flocs$pos[which(flocs$chrom==x$chrom)] - x$pos))
closestMotif <- flocs$pos[index]
chrom <- as.character(flocs$chrom[index])
dist <- (x$pos - closestMotif)
data.frame(features_pos = closestMotif, pos = x$pos, min_dist = dist, feature = feature)
})
But this doesn't constrain comparisons to the same chromosome:
head(minDists, 10)
chrom features_pos pos min_dist feature
1 1 100 4 -96 feature1
2 1 100 14 -86 feature1
3 1 100 68 -32 feature1
4 1 100 79 -21 feature1
5 1 200 200 0 feature1
6 1 200 205 5 feature1
7 1 312 270 -42 feature1
8 1 312 304 -8 feature1
9 2 100 7 -93 feature1 # bps chrom=2, flocs chrom=1
10 2 100 13 -87 feature1 # bps chrom=2, flocs chrom=1
The expected output here is:
chrom features_pos pos min_dist feature
1 1 100 4 -96 feature1
2 1 100 14 -86 feature1
3 1 100 68 -32 feature1
4 1 100 79 -21 feature1
5 1 200 200 0 feature1
6 1 200 205 5 feature1
7 1 312 270 -42 feature1
8 1 312 304 -8 feature1
9 2 500 7 -493 feature1 # bp1 chrom=2, flocs chrom=2
10 2 500 13 -487 feature1 # bp1 chrom=2, flocs chrom=2
I thought that by providing the columns c("chrom", "pos") essentially performed a group_by to the function call.
Is there any way that I can improve what I've written to achieve the desired result?
bps <- structure(list(chrom = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("1", "2", "3"
), class = "factor"), pos = c(4L, 14L, 68L, 79L, 200L, 205L,
270L, 304L, 7L, 13L, 23L, 39L, 100L, 150L, 17L, 55L, 75L, 79L,
102L, 109L, 123L, 155L, 157L, 200L, 260L, 299L, 300L, 320L, 323L,
345L, 450L, 550L), iteration = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "1", class = "factor")), row.names = c(NA,
-32L), class = "data.frame")
flocs <- structure(list(chrom = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 3L,
3L), .Label = c("1", "2", "3"), class = "factor"), pos = c(100L,
200L, 220L, 312L, 500L, 501L, 123L, 444L)), row.names = c(NA,
-8L), class = "data.frame")

data.table approach using a rolling join...
updated answer
(initially forgot all about the by-reference joining, which is faster and most certainly shorter ;-) )
library( data.table )
#set data as data.table
setDT( bps, key = c("chrom", "pos") )
setDT( flocs, key = c("chrom", "pos") )
#perform by-reference rolling join
bps[, mindist := pos - flocs[bps, x.pos, roll = "nearest"]][]
output
# chrom pos iteration mindist
# 1: 1 4 1 -96
# 2: 1 14 1 -86
# 3: 1 68 1 -32
# 4: 1 79 1 -21
# 5: 1 200 1 0
# 6: 1 205 1 5
# 7: 1 270 1 -42
# 8: 1 304 1 -8
# 9: 2 7 1 -493
# 10: 2 13 1 -487
# 11: 2 23 1 -477
# 12: 2 39 1 -461
# 13: 2 100 1 -400
# 14: 2 150 1 -350
# 15: 3 17 1 -106
# 16: 3 55 1 -68
# 17: 3 75 1 -48
# 18: 3 79 1 -44
# 19: 3 102 1 -21
# 20: 3 109 1 -14
# 21: 3 123 1 0
# 22: 3 155 1 32
# 23: 3 157 1 34
# 24: 3 200 1 77
# 25: 3 260 1 137
# 26: 3 299 1 -145
# 27: 3 300 1 -144
# 28: 3 320 1 -124
# 29: 3 323 1 -121
# 30: 3 345 1 -99
# 31: 3 450 1 6
# 32: 3 550 1 106
# chrom pos iteration mindist
Benchmarking answer until now
# Unit: milliseconds
# expr min lq mean median uq max neval
# Ronak_base 2.355879 2.555768 2.973069 2.626415 2.773581 8.016016 100
# Wimpel_data.table 1.697921 2.035788 2.416199 2.209616 2.361001 17.724528 100
# Pawel_tidyverse 14.845354 15.310505 16.333158 15.814819 16.541618 24.077871 100
microbenchmark::microbenchmark(
Ronak_base = {
bps$min_dist <- unlist(mapply(return_min_value, unique(bps$chrom), split(bps$pos, bps$chrom)))
},
Wimpel_data.table = {
setDT( bps, key = c("chrom", "pos") )
setDT( flocs, key = c("chrom", "pos") )
#perform by-reference rolling join
bps[, mindist := pos - flocs[bps, x.pos, roll = "nearest"]][]
},
Pawel_tidyverse = {
bps %>%
select(-iteration) %>%
unite('bps') %>%
crossing(flocs %>% unite('flocks')) %>%
separate(bps, c('chrom_bps', 'pos')) %>%
separate(flocks, c('chrom_flocks', 'features_pos')) %>%
filter(chrom_bps == chrom_flocks) %>%
select(-chrom_flocks) %>%
rename_at(1, ~'chrom') %>%
mutate_all(as.numeric) %>%
mutate(min_dist = pos - features_pos) %>%
group_by(chrom, pos) %>%
filter(abs(min_dist) == min(abs(min_dist)))
}
)
Looks like my data-table answer and the answer by Ronak Shah are pretty close together. I believe that data.table will gain the clear advantage when the data-sets are getting lager-huge (but I haven't tested)..

My base R attempt by creating a helper function (return_min_value). This function subset flocs based on current chrom and then returns the minimum value after subtracting it from pos. We split the pos column based on chrom and pass these values along with unique chrom values in mapply to return_min_value function.
return_min_value <- function(x, y) {
sapply(y, function(p) {
vals = p - flocs$pos[flocs$chrom == x]
vals[which.min(abs(vals))]
})
}
bps$min_dist <- unlist(mapply(return_min_value,
unique(bps$chrom), split(bps$pos, bps$chrom)))
bps
# chrom pos iteration min_dist
#1 1 4 1 -96
#2 1 14 1 -86
#3 1 68 1 -32
#4 1 79 1 -21
#5 1 200 1 0
#6 1 205 1 5
#7 1 270 1 -42
#8 1 304 1 -8
#9 2 7 1 -493
#10 2 13 1 -487
#...

Check this solution:
library(tidyverse)
bps %>%
select(-iteration) %>%
unite('bps') %>%
crossing(flocs %>% unite('flocks')) %>%
separate(bps, c('chrom_bps', 'pos')) %>%
separate(flocks, c('chrom_flocks', 'features_pos')) %>%
filter(chrom_bps == chrom_flocks) %>%
select(-chrom_flocks) %>%
rename_at(1, ~'chrom') %>%
mutate_all(as.numeric) %>%
mutate(min_dist = pos - features_pos) %>%
group_by(chrom, pos) %>%
filter(abs(min_dist) == min(abs(min_dist)))
Output:
chrom pos features_pos min_dist
<dbl> <dbl> <dbl> <dbl>
1 1 4 100 -96
2 1 14 100 -86
3 1 68 100 -32
4 1 79 100 -21
5 1 200 200 0
6 1 205 200 5
7 1 270 312 -42
8 1 304 312 -8
9 2 7 500 -493
10 2 13 500 -487
# ... with 22 more rows

Related

Iteration through rows of a dataframe within group of columns in R

I have a dataframe df with 6 fields A,B,C,D,E & F. My requirement is to create a new column G which is equal to the previous value(C) + previous value(D) + previous (G) - F. But this needs to be implemented at a group level through columns A & B (group by A & B). In case it is the first row within the group then the value in column G should be equal to E.
Sample Df -
A B C D E F
1 2 100 200 300 0
1 2 110 210 310 10
1 2 120 130 300 10
1 1 140 150 80 0
1 1 50 60 80 20
1 1 50 60 80 20
Output -
A B C D E F G
1 2 100 200 300 0 300
1 2 110 210 310 10 590
1 2 120 130 300 10 900
1 1 140 150 80 0 80
1 1 50 60 80 20 350
1 1 50 60 80 20 440
Please provide a suitable solution.
Here is one option with dplyr where we group by 'A', 'B', take the lag of 'C', 'D', 'E' add (+) them, and subtract from 'F', and coalesce with the 'E' column
library(dplyr)
df1 %>%
group_by(A, B) %>%
mutate(G = coalesce(lag(C) + lag(D) + lag(E) - F, E))
-output
# A tibble: 6 x 7
# Groups: A, B [2]
# A B C D E F G
# <int> <int> <int> <int> <int> <int> <int>
#1 1 2 100 200 300 0 300
#2 1 2 110 210 310 10 590
#3 1 2 120 130 300 10 620
#4 1 1 140 150 80 0 80
#5 1 1 50 60 80 20 350
#6 1 1 50 60 80 20 170
data
df1 <- structure(list(A = c(1L, 1L, 1L, 1L, 1L, 1L), B = c(2L, 2L, 2L,
1L, 1L, 1L), C = c(100L, 110L, 120L, 140L, 50L, 50L), D = c(200L,
210L, 130L, 150L, 60L, 60L), E = c(300L, 310L, 300L, 80L, 80L,
80L), F = c(0L, 10L, 10L, 0L, 20L, 20L)), class = "data.frame",
row.names = c(NA,
-6L))

Is there R function for removing specific column condition

Hello all my df looks like
PID V1
123 1
123 2
123 3
111 1
111 2
111 1
122 3
122 1
122 1
333 1
333 4
333 2
I want to delete rows contains 1 and 2 event alone for the PID
and expected output
PID V1
123 1
123 2
123 3
122 3
122 1
122 1
333 1
333 4
333 2
You can do this in base R :
subset(df, !ave(V1 %in% 1:2, PID, FUN = all))
# PID V1
#1 123 1
#2 123 2
#3 123 3
#7 122 3
#8 122 1
#9 122 1
#10 333 1
#11 333 4
#12 333 2
dplyr
library(dplyr)
df %>% group_by(PID) %>% filter(!all(V1 %in% 1:2))
or data.table :
library(data.table)
setDT(df)[, .SD[!all(V1 %in% 1:2)], PID]
The logic of all of them is the same. Remove groups (PID) who have only 1 and 2 in V1 column.
data
df <- structure(list(PID = c(123L, 123L, 123L, 111L, 111L, 111L, 122L,
122L, 122L, 333L, 333L, 333L), V1 = c(1L, 2L, 3L, 1L, 2L, 1L,
3L, 1L, 1L, 1L, 4L, 2L)), class = "data.frame", row.names = c(NA, -12L))

Calculating within group differences R

I’m trying to figure out how to append a column that identifies whether a difference of 10 exists between different IDs for a given day using the column named reading.
**Day ID Reading**
19-Jan 1 10
19-Jan 1 10
19-Jan 1 10
19-Jan 1 20
19-Jan 2 20
19-Jan 2 20
19-Jan 2 20
19-Jan 2 20
20-Jan 1 10
21-Jan 1 10
22-Jan 1 10
23-Jan 1 10
24-Jan 1 20
25-Jan 2 20
25-Jan 2 20
25-Jan 2 20
25-Jan 2 10
I would like:
**Day ID Reading Difference**
19-Jan 1 10 Y
19-Jan 1 10 Y
19-Jan 1 10 Y
19-Jan 1 20 Y
19-Jan 2 20 N
19-Jan 2 20 N
19-Jan 2 20 N
19-Jan 2 20 N
20-Jan 1 10 N
21-Jan 1 10 N
22-Jan 1 10 N
23-Jan 1 10 N
24-Jan 1 20 N
25-Jan 2 20 Y
25-Jan 2 20 Y
25-Jan 2 20 Y
25-Jan 2 10 Y
What you could do is to check whether the difference of the range is equal to or greater than 10 for each group.
dat$Diff <- with(dat, ave(Reading, Day, ID, FUN = function(x) diff(range(x)) >= 10))
dat
# Day ID Reading Diff
#1 19-Jan 1 10 1
#2 19-Jan 1 10 1
#3 19-Jan 1 10 1
#4 19-Jan 1 20 1
#5 19-Jan 2 20 0
#6 19-Jan 2 20 0
#7 19-Jan 2 20 0
#8 19-Jan 2 20 0
#9 20-Jan 1 10 0
#10 21-Jan 1 10 0
#11 22-Jan 1 10 0
#12 23-Jan 1 10 0
#13 24-Jan 1 20 0
#14 25-Jan 2 20 1
#15 25-Jan 2 20 1
#16 25-Jan 2 20 1
#17 25-Jan 2 10 1
data
dat <- structure(list(Day = c("19-Jan", "19-Jan", "19-Jan", "19-Jan",
"19-Jan", "19-Jan", "19-Jan", "19-Jan", "20-Jan", "21-Jan", "22-Jan",
"23-Jan", "24-Jan", "25-Jan", "25-Jan", "25-Jan", "25-Jan"),
ID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L), Reading = c(10L, 10L, 10L, 20L, 20L, 20L,
20L, 20L, 10L, 10L, 10L, 10L, 20L, 20L, 20L, 20L, 10L)), .Names = c("Day",
"ID", "Reading"), class = "data.frame", row.names = c(NA, -17L
))
We can use data.table
library(data.table)
setDT(df1)[, Difference := abs(Reduce(`-`, as.list(range(Reading)))) >= 10,
.(ID, Day)]
df1
# Day ID Reading Difference
# 1: 19-Jan 1 10 TRUE
# 2: 19-Jan 1 10 TRUE
# 3: 19-Jan 1 10 TRUE
# 4: 19-Jan 1 20 TRUE
# 5: 19-Jan 2 20 FALSE
# 6: 19-Jan 2 20 FALSE
# 7: 19-Jan 2 20 FALSE
# 8: 19-Jan 2 20 FALSE
# 9: 20-Jan 1 10 FALSE
#10: 21-Jan 1 10 FALSE
#11: 22-Jan 1 10 FALSE
#12: 23-Jan 1 10 FALSE
#13: 24-Jan 1 20 FALSE
#14: 25-Jan 2 20 TRUE
#15: 25-Jan 2 20 TRUE
#16: 25-Jan 2 20 TRUE
#17: 25-Jan 2 10 TRUE
data
df1 <- structure(list(Day = c("19-Jan", "19-Jan", "19-Jan", "19-Jan",
"19-Jan", "19-Jan", "19-Jan", "19-Jan", "20-Jan", "21-Jan", "22-Jan",
"23-Jan", "24-Jan", "25-Jan", "25-Jan", "25-Jan", "25-Jan"),
ID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L), Reading = c(10L, 10L, 10L, 20L, 20L, 20L,
20L, 20L, 10L, 10L, 10L, 10L, 20L, 20L, 20L, 20L, 10L)),
class = "data.frame", row.names = c(NA, -17L))
Using tidyverse you could do something like
library(tidyverse)
your_data %>%
group_by(Day, ID) %>%
mutate(difference = (max(difference) - min(difference)) >= 10)

How to subtract one row from multiple rows by group, for data set with multiple columns in R?

I would like to learn how to subtract one row from multiple rows by group, and save the results as a data table/matrix in R. For example, take the following data frame:
data.frame("patient" = c("a","a","a", "b","b","b","c","c","c"), "Time" = c(1,2,3), "Measure 1" = sample(1:100,size = 9,replace = TRUE), "Measure 2" = sample(1:100,size = 9,replace = TRUE), "Measure 3" = sample(1:100,size = 9,replace = TRUE))
patient Time Measure.1 Measure.2 Measure.3
1 a 1 19 5 75
2 a 2 64 20 74
3 a 3 40 4 78
4 b 1 80 91 80
5 b 2 48 31 73
6 b 3 10 5 4
7 c 1 30 67 55
8 c 2 24 13 90
9 c 3 45 31 88
For each patient, I would like to subtract the row where Time == 1 from all rows associated with that patient. The result would be:
patient Time Measure.1 Measure.2 Measure.3
1 a 1 0 0 0
2 a 2 45 15 -1
3 a 3 21 -1 3
4 b 1 0 0 0
5 b 2 -32 -60 -5
6 b 3 -70 -86 -76
7 c 1 0 0 0
....
I have tried the following code using the dplyr package, but to no avail:
raw_patient<- group_by(rawdata,patient, Time)
baseline_patient <-mutate(raw_patient,cpls = raw_patient[,]- raw_patient["Time" == 0,])
As there are multiple columns, we can use mutate_at by specifying the variables in vars and then subtract the elements from those elements in each column that corresponds to 'Time' 1 after grouping by 'patient'
library(dplyr)
df1 %>%
group_by(patient) %>%
mutate_at(vars(matches("Measure")), funs(.- .[Time==1]))
# A tibble: 9 × 5
# Groups: patient [3]
# patient Time Measure.1 Measure.2 Measure.3
# <chr> <int> <int> <int> <int>
#1 a 1 0 0 0
#2 a 2 45 15 -1
#3 a 3 21 -1 3
#4 b 1 0 0 0
#5 b 2 -32 -60 -7
#6 b 3 -70 -86 -76
#7 c 1 0 0 0
#8 c 2 -6 -54 35
#9 c 3 15 -36 33
data
df1 <- structure(list(patient = c("a", "a", "a", "b", "b", "b", "c",
"c", "c"), Time = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), Measure.1 = c(19L,
64L, 40L, 80L, 48L, 10L, 30L, 24L, 45L), Measure.2 = c(5L, 20L,
4L, 91L, 31L, 5L, 67L, 13L, 31L), Measure.3 = c(75L, 74L, 78L,
80L, 73L, 4L, 55L, 90L, 88L)), .Names = c("patient", "Time",
"Measure.1", "Measure.2", "Measure.3"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9"))

How to select data that have complete cases of a certain column?

I'm trying to get a data frame (just.samples.with.shoulder.values, say) contain only samples that have non-NA values. I've tried to accomplish this using the complete.cases function, but I imagine that I'm doing something wrong syntactically below:
data <- structure(list(Sample = 1:14, Head = c(1L, 0L, NA, 1L, 1L, 1L,
0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L), Shoulders = c(13L, 14L, NA,
18L, 10L, 24L, 53L, NA, 86L, 9L, 65L, 87L, 54L, 36L), Knees = c(1L,
1L, NA, 1L, 1L, 2L, 3L, 2L, 1L, NA, 2L, 3L, 4L, 3L), Toes = c(324L,
5L, NA, NA, 5L, 67L, 785L, 42562L, 554L, 456L, 7L, NA, 54L, NA
)), .Names = c("Sample", "Head", "Shoulders", "Knees", "Toes"
), class = "data.frame", row.names = c(NA, -14L))
just.samples.with.shoulder.values <- data[complete.cases(data[,"Shoulders"])]
print(just.samples.with.shoulder.values)
I would also be interested to know whether some other route (using subset(), say) is a wiser idea. Thanks so much for the help!
You can try complete.cases too which will return a logical vector which allow to subset the data by Shoulders
data[complete.cases(data$Shoulders), ]
# Sample Head Shoulders Knees Toes
# 1 1 1 13 1 324
# 2 2 0 14 1 5
# 4 4 1 18 1 NA
# 5 5 1 10 1 5
# 6 6 1 24 2 67
# 7 7 0 53 3 785
# 9 9 1 86 1 554
# 10 10 1 9 NA 456
# 11 11 1 65 2 7
# 12 12 1 87 3 NA
# 13 13 0 54 4 54
# 14 14 1 36 3 NA
You could try using is.na:
data[!is.na(data["Shoulders"]),]
Sample Head Shoulders Knees Toes
1 1 1 13 1 324
2 2 0 14 1 5
4 4 1 18 1 NA
5 5 1 10 1 5
6 6 1 24 2 67
7 7 0 53 3 785
9 9 1 86 1 554
10 10 1 9 NA 456
11 11 1 65 2 7
12 12 1 87 3 NA
13 13 0 54 4 54
14 14 1 36 3 NA
There is a subtle difference between using is.na and complete.cases.
is.na will remove actual na values whereas the objective here is to only control for a variable not deal with missing values/na's those which could be legitimate data points

Resources