correlation with a reference group in a dataframe - r

Consider the noMissing data frame:
library(lubridate)
set.seed(123)
value <- rnorm(300)
value[sample(1:300,10)]<- NA
b <- rep(c("a","b", "c", "d","e", "f"), each=50)
b[sample(1:300,12)] <- NA
c <- rep(rep(as.character(1:2), each = 25) , 6)
c[sample(1:300,10)] <- NA
datee <- seq(lubridate::ymd("2012-01-01"),lubridate::ymd("2012-01-01") + 24 , by = "days")
datee <- rep(datee, 12)
datee[sample(1:300,20)] <- NA
dataframe <- cbind.data.frame( b, c, datee, value)
noMissing <- dataframe[complete.cases(dataframe),]
head(noMissing)
b c datee value
1 a 1 2012-01-01 -0.56047565
2 a 1 2012-01-02 -0.23017749
3 a 1 2012-01-03 1.55870831
4 a 1 2012-01-04 0.07050839
5 a 1 2012-01-05 0.12928774
6 a 1 2012-01-06 1.71506499
Now I want to group the data by columns b and c, and then calculate the correlation of each group with the reference group a (in column b), using only the dates in datee that the two groups have in common.
For example, the correlation between group (b, 1) and the reference group (a, 1).
My initial solution:
b_unique <- unique(noMissing$b)
c_unique <- unique(noMissing$c)
out <- list()
v <- 0
for (i in 1:length(b_unique)) {
  for (k in 1:length(c_unique)) {
    v <- v + 1
    group <- noMissing[noMissing$b == b_unique[i] & noMissing$c == c_unique[k], ]
    ref   <- noMissing[noMissing$b == "a" & noMissing$c == c_unique[k], ]
    inter <- ymd("1970-01-01") + intersect(ref$datee, group$datee)
    x <- cor(group$value[group$datee %in% inter], ref[ref$datee %in% inter, "value"])
    out[[v]] <- list(b = b_unique[i], c = c_unique[k], cor = x)
  }
}
dplyr::bind_rows(out)
b c cor
<fct> <fct> <dbl>
1 a 1 1.000
2 a 2 1
3 b 1 0.175
4 b 2 -0.247
5 c 1 0.216
6 c 2 0.101
7 d 1 0.159
8 d 2 -0.253
9 e 1 0.177
10 e 2 -0.528
11 f 1 0.179
12 f 2 -0.178
I am looking for a cleaner, "good taste" coding solution.

You can do the following:
library(data.table)
# convert the data shape to have datewise information across all groups
df <- dcast(data.table(noMissing), datee+c ~ b, value.var='value')
# rename the grouping column c to c_1, since dcast also creates a column named c (from the levels of b)
setnames(df, old = 2, new = 'c_1')
# group by 'c_1' and, for each group, calculate the correlation of each of the columns a-f with column a
df <- df[,
         lapply(.SD[, -c('datee'), with = FALSE], function(x) {
           cols <- c('a', 'b', 'c', 'd', 'e', 'f')
           vals <- vector(mode = 'numeric', length = 6)
           for (i in seq(cols)) {
             vals[i] <- cor(get(cols[i]), get('a'), use = 'complete.obs')
           }
           return(vals)
         }),
         by = c_1]
# finally reshape the table into the long format posted in the solution above
df <- melt(df, id.vars = c('c_1'))
colnames(df) <- c('c','b','cor')
c b cor
1: 1 a 1.00000000
2: 1 a -0.12499728
3: 1 a -0.13133257
4: 1 a 0.02573947
5: 1 a 0.07239559
6: 1 a -0.07421281
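For comparison, a dplyr alternative is also possible (a sketch, not part of the original answer; it assumes the noMissing data frame from the question): build the reference series for group a, join it back onto the data by c and datee so only shared dates remain, and compute one correlation per (b, c) group.
library(dplyr)
# reference series: values of group "a", keyed by c and datee
ref <- noMissing %>%
  filter(b == "a") %>%
  select(c, datee, ref_value = value)
# join every group to the reference on shared dates, then correlate per group
noMissing %>%
  inner_join(ref, by = c("c", "datee")) %>%
  group_by(b, c) %>%
  summarise(cor = cor(value, ref_value), .groups = "drop")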

Related

Using lapply with data in two lists in R

I have two lists and I want to use lapply to get a new list
The data is
library(dplyr)
list.A <- list(df1=data.frame(x= c(1:5), y = letters[1:5], z= rep(1,5)),
df2=data.frame(x= c(10:15), y = letters[5:10], z= rep(10,6)))
list.B <- list(df1=data.frame(x= c(1:6), var2 = letters[10:15], var3= rep(7,6)),
df2=data.frame(x= c(10,12), var2 = letters[1:2], var3= rep(5,2)))
I want the result to be as follows:
dat.1 <- left_join(list.A[[1]], list.B[[1]], by=("x"))
dat.2 <- left_join(list.A[[2]], list.B[[2]], by=("x"))
new.list <- list(df1 = dat.1, df2 =dat.2)
But when I use lapply the results are not what I want: nesting two lapply calls joins every element of list.A with every element of list.B, instead of pairing them element by element.
new.list <- lapply(list.A, function(a){lapply(list.B, function(b){
df <-left_join(a, b, by=("x"))
})
})
Any help, please. Do I need a for loop, or would lapply work?
My actual lists contain many data frames.
We can use map2 from purrr, which loops over the corresponding elements of both lists and does the left_join by the 'x' column.
library(dplyr)
library(purrr)
map2(list.A, list.B, ~ left_join(.x, .y, by = 'x'))
-output
#$df1
# x y z var2 var3
#1 1 a 1 j 7
#2 2 b 1 k 7
#3 3 c 1 l 7
#4 4 d 1 m 7
#5 5 e 1 n 7
#$df2
# x y z var2 var3
#1 10 e 10 a 5
#2 11 f 10 <NA> NA
#3 12 g 10 b 5
#4 13 h 10 <NA> NA
#5 14 i 10 <NA> NA
#6 15 j 10 <NA> NA
Or Map (from base R)
Map(merge, list.A, list.B, MoreArgs = list(all.x = TRUE, by = 'x'))
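If you prefer to stay with lapply, a possible sketch (assuming both lists share the same element names, as in the example) is to loop over the names and index both lists by name:
library(dplyr)
new.list <- lapply(names(list.A), function(nm) left_join(list.A[[nm]], list.B[[nm]], by = "x"))
names(new.list) <- names(list.A)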

aggregate and remove duplicates for some rows

I have a dataset like
df <- data.frame(id = c("a","a","b","b","c","d","e","f"),
val = c(1,2,3,4,5,6,7,8),
extracol = c("x",NA,"y","z","t","v","u","p"))
id val extracol
1 a 1 x
2 a 2 <NA>
3 b 3 y
4 b 4 z
5 c 5 t
6 d 6 v
7 e 7 u
8 f 8 p
and I want to sum (and aggregate) the values according to the column id but only for "a". So I want to get something like:
id val extracol
1 a 3 x
2 b 3 y
3 b 4 z
4 c 5 t
5 d 6 v
6 e 7 u
7 f 8 p
I really don't care if I get "x" or NA in the extracol. Any suggestion?
This would work:
library(dplyr)
df <- data.frame(id = c("a","a","b","b","c","d","e","f"),
val = c(1,2,3,4,5,6,7,8),
extracol = c("x",NA,"y","z","t","v","u","p"))
# keep only a
a = df %>% filter(id == "a")
# aggregate a
a_agg= a %>% group_by(id) %>% summarise(val = sum(val), extracol = first(extracol))
# drop a
df = df %>% filter(id != "a")
# append a
df = rbind(df, a_agg)
df
id val extracol
1 b 3 y
2 b 4 z
3 c 5 t
4 d 6 v
5 e 7 u
6 f 8 p
7 a 3 x
A base R option
with(
  df,
  rbind(
    data.frame(
      id = "a",
      val = sum(val[id == "a"]),
      extracol = na.omit(extracol[id == "a"])
    ),
    df[id != "a", ]
  )
)
gives
id val extracol
1 a 3 x
3 b 3 y
4 b 4 z
5 c 5 t
6 d 6 v
7 e 7 u
8 f 8 p
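If the real data is large (an assumption), a data.table sketch of the same idea, starting from the original df of the question:
library(data.table)
dt <- as.data.table(df)
rbind(
  dt[id == "a", .(val = sum(val), extracol = na.omit(extracol)[1]), by = id],
  dt[id != "a"]
)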

filter one dataframe via conditions in another

I want to repeatedly filter a data frame d by an arbitrary number of conditions (represented as rows in another data frame z).
I begin with a dataframe d:
d <- data.frame(x = 1:10, y = letters[1:10])
The second dataframe z, has columns x1 and x2, which are lower and upper limits to filter d$x. This dataframe z may grow to be an arbitrary number of rows long.
z <- data.frame(x1 = c(1,3,8), x2 = c(1,4,10))
I want to return all rows of d for which d$x does not fall inside any of the intervals [z$x1[i], z$x2[i]], for i = 1, ..., nrow(z).
So for this toy example, exclude everything from 1:1, 3:4, and 8:10, inclusive.
x y
2 2 b
5 5 e
6 6 f
7 7 g
We can create a sequence between the x1 and x2 values (this works here because x takes integer values) and use anti_join to drop the rows of d whose x appears in that expanded set.
library(tidyverse)
remove <- z %>%
mutate(x = map2(x1, x2, seq)) %>%
unnest(x) %>%
select(x)
anti_join(d, remove)
# x y
#1 2 b
#2 5 e
#3 6 f
#4 7 g
We can use a non-equi join
library(data.table)
i1 <- setDT(d)[z, .I, on = .(x >=x1, x <= x2), by = .EACHI]$I
i1
#[1] 1 3 4 8 9 10
d[i1]
# x y
#1: 1 a
#2: 3 c
#3: 4 d
#4: 8 h
#5: 9 i
#6: 10 j
d[!i1]
# x y
#1: 2 b
#2: 5 e
#3: 6 f
#4: 7 g
Or using fuzzyjoin
library(fuzzyjoin)
library(dplyr)
fuzzy_inner_join(d, z, by = c('x' = 'x1', 'x' = 'x2'),
match_fun = list(`>=`, `<=`)) %>%
select(names(d))
# A tibble: 6 x 2
# x y
# <int> <fct>
#1 1 a
#2 3 c
#3 4 d
#4 8 h
#5 9 i
#6 10 j
Or, to get the rows of 'd' whose 'x' is not covered by any interval:
fuzzy_anti_join(d, z, by = c('x' = 'x1', 'x' = 'x2'),
match_fun = list(`>=`, `<=`)) %>%
select(names(d))
# A tibble: 4 x 2
# x y
# <int> <fct>
#1 2 b
#2 5 e
#3 6 f
#4 7 g
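A base R sketch with no extra packages is also possible (assuming z stays small enough that checking each interval in turn is acceptable): flag every row of d that falls inside some interval, then keep the rest.
outside <- !Reduce(`|`, Map(function(lo, hi) d$x >= lo & d$x <= hi, z$x1, z$x2))
d[outside, ]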

ddply multifactorial all pairwise t-tests

How can I perform t-tests, within each feature, for all possible pairs of groups, with a minimal number of lines of code?
My example:
3x features : 1,2,3
4x groups: : A,B,C,D
Aim: For each feature test all pairs of groups:
1(A-B,A-C,A-D,B-C,B-D,C-D)
2(A-B,A-C,A-D,B-C,B-D,C-D)
3(A-B,A-C,A-D,B-C,B-D,C-D)
= 18 T-tests
At the moment I am using ddply with an lapply inside:
library(plyr)
groupVector <- c(rep("A",10),rep("B",10),rep("C",10),rep("D",10))
featureVector <- rep(1:3,each=40)
mydata <- data.frame(feature=factorVector,group=groupVector,value=rnorm(120,0,1))
ddply(mydata,.(feature),function(x){
grid <- combn(unique(x$group),2, simplify = FALSE)
df <- lapply(grid,function(p){
sub <- subset(x,group %in% p)
pval <- t.test(sub$value ~ sub$group)$p.value
data.frame(groupA=p[1],groupB=p[2],pval=pval)
})
res <- do.call("rbind",df)
return(res)
})
Here's my take, although it's arguable whether it's 'better'
split.data <- split(mydata, mydata$feature)
pairs <- as.data.frame(matrix(combn(unique(mydata$group), 2), nrow = 2))
library(tidyverse)
map_df(split.data, function(x) {
  map_df(pairs, function(y) {
    tibble(groupA = y[1], groupB = y[2],
           pval = t.test(value ~ group, data = x, subset = which(x$group %in% y))$p.value)
  })
}, .id = "feature")
Output
# # A tibble: 18 x 4
# feature groupA groupB pval
# <chr> <chr> <chr> <dbl>
# 1 1 A B 0.28452419
# 2 1 A C 0.65114472
# 3 1 A D 0.77746420
# 4 1 B C 0.42546791
# 5 1 B D 0.39876582
# 6 1 C D 0.88079645
# 7 2 A B 0.57843592
# 8 2 A C 0.30726571
# 9 2 A D 0.55457986
# 10 2 B C 0.74871464
# 11 2 B D 0.24017130
# 12 2 C D 0.04252878
# 13 3 A B 0.01355117
# 14 3 A C 0.08746756
# 15 3 A D 0.24527519
# 16 3 B C 0.15130684
# 17 3 B D 0.09172577
# 18 3 C D 0.64206517
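A shorter base R sketch is pairwise.t.test applied per feature; with pool.sd = FALSE and p.adjust.method = "none" it runs an unpooled (Welch) t-test for each pair, which is closest to the separate t.test calls above. It returns a matrix of p-values per feature rather than a long data frame.
lapply(split(mydata, mydata$feature), function(x)
  pairwise.t.test(x$value, x$group, pool.sd = FALSE, p.adjust.method = "none")$p.value)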

How to count rows with conditional after grouping in data.table

I have the following data frame:
library(readr)
dat <- read_csv(
"s1,s2,v1,v2
a,b,10,20
a,b,22,NA
a,b,13,33
c,d,3,NA
c,d,4.5,NA
c,d,10,20"
)
dat
#> # A tibble: 6 x 4
#> s1 s2 v1 v2
#> <chr> <chr> <dbl> <int>
#> 1 a b 10.0 20
#> 2 a b 22.0 NA
#> 3 a b 13.0 33
#> 4 c d 3.0 NA
#> 5 c d 4.5 NA
#> 6 c d 10.0 20
What I want to do is:
Filter rows based on v1 values
Group by s1 and s2
Count the total number of lines in every group
Count the lines in every group where v2 is not NA
For example, with v1_filter >= 0 we get this:
s1 s2 total_line non_na_line
a b 3 2
c d 3 1
And with v1_filter >= 10 we get this:
s1 s2 total_line non_na_line
a b 2 1
c d 1 1
How can I achieve that with data.table or dplyr?
In reality we have around 31M rows in dat, so we need a fast method.
I'm stuck with this
library(data.table)
dat <- data.table(dat)
v1_filter = 0
dat[, v1 >= v1_filter,
by=list(s1,s2)]
Using sum should help. Operating on a logical vector, it treats each TRUE as 1 and FALSE as 0, so you can easily do this:
dat %>%
group_by(s1, s2) %>%
summarise(total_lines = n(),
non_na_line = sum(!is.na(v2)))
# A tibble: 2 x 4
# Groups: s1 [?]
s1 s2 total_lines non_na_line
<chr> <chr> <int> <int>
1 a b 3 2
2 c d 3 1
You'll easily be able to add in a filter between group_by and summarise, to get what you want. Keep in mind that summarise will only retain columns that you group by.
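For example, with the v1 >= 10 filter added between group_by and summarise (a sketch of the same pipeline):
dat %>%
  group_by(s1, s2) %>%
  filter(v1 >= 10) %>%
  summarise(total_lines = n(),
            non_na_line = sum(!is.na(v2)))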
Benchmark
For what it's worth, I ran a quick benchmark, with some test data of similar size as yours.
s1charMix <- rep(letters[seq(from = 1, to = 10)], length.out = 30000000)
s2charMix <- rep(letters[seq(from = 11, to = 20)], length.out = 30000000)
s1chars <- sample(s1charMix, 30000000)
s2chars <- sample(s2charMix, 30000000)
v1Nums <- runif(30000000, min = 0, max = 20)
nomissing <- sample(1:200000,1)
int.mix <- rbinom(30000000 - nomissing, 30, 0.3)
nalist <- rep(NA, nomissing)
v2NumsNA <- sample(x = c(int.mix, nalist), 30000000)
df <- data_frame(s1 = s1chars, s2 = s2chars, v1 = v1Nums, v2 = v2NumsNA)
This should roughly replicate the size and type of the data you suggest:
df
# A tibble: 30,000,000 x 4
s1 s2 v1 v2
<chr> <chr> <dbl> <int>
1 d s 9.2123603 7
2 b q 16.6638639 11
3 g o 18.3682028 11
4 g s 0.8779067 9
5 a s 0.0719127 10
6 b q 16.8809193 12
7 h q 15.4382455 6
8 e k 2.3565489 11
9 h p 16.4508811 9
10 d n 2.7283823 11
# ... with 29,999,990 more rows
df %>%
filter(is.na(v2))
# A tibble: 116,924 x 4
s1 s2 v1 v2
<chr> <chr> <dbl> <int>
1 d r 13.1448988 NA
2 b o 0.2703848 NA
3 b t 18.8319385 NA
4 a s 11.6448437 NA
5 j m 0.5388760 NA
6 i k 8.7098427 NA
7 d s 6.1149735 NA
8 h p 2.5552694 NA
9 g r 0.9057442 NA
10 b s 19.8886830 NA
# ... with 116,914 more rows
Now, let's benchmark dplyr operations vs data.table:
### dplyr
df %>%
filter(v1 > 10) %>%
group_by(s1, s2) %>%
summarise(total_lines = n(),
non_na_line = sum(!is.na(v2)))
# A tibble: 100 x 4
# Groups: s1 [?]
s1 s2 total_lines non_na_line
<chr> <chr> <int> <int>
1 a k 150327 149734
2 a l 149655 149062
3 a m 149794 149200
4 a n 149771 149197
5 a o 149495 148942
...
> system.time(df %>% filter(v1 > 10) %>% group_by(s1, s2) %>% summarise(total_lines = n(), non_na_line = sum(!is.na(v2))))
user system elapsed
1.848 0.420 2.290
> system.time(for (i in 1:100) df %>% filter(v1 > 10) %>% group_by(s1, s2) %>% summarise(total_lines = n(), non_na_line = sum(!is.na(v2))))
user system elapsed
187.657 55.878 245.528
### Data.table
library(data.table)
dat <- data.table(df)
> dat[v1 > 10, .N, by = .(s1, s2)][dat[v1 > 10 & !is.na(v2), .N, by = .(s1, s2)] , on = c("s1", "s2") , nomatch = 0]
s1 s2 N i.N
1: b q 149968 149348
2: g o 150411 149831
3: h q 150132 149563
4: h p 150786 150224
5: e o 149951 149353
...
> system.time(dat[v1 > 10, .N, by = .(s1, s2)][dat[v1 > 10 & !is.na(v2), .N, by = .(s1, s2)] , on = c("s1", "s2") , nomatch = 0])
user system elapsed
2.027 0.228 2.271
> system.time(for (i in 1:100) dat[v1 > 10, .N, by = .(s1, s2)][dat[v1 > 10 & !is.na(v2), .N, by = .(s1, s2)] , on = c("s1", "s2") , nomatch = 0])
user system elapsed
213.281 43.949 261.664
TL;DR: dplyr and data.table are similarly fast; if anything, dplyr is slightly faster.
> library(readr)
> dat <- read_csv(
+ "s1,s2,v1,v2
+ a,b,10,20
+ a,b,22,NA
+ a,b,13,33
+ c,d,3,NA
+ c,d,4.5,NA
+ c,d,10,20"
+ )
>
> dat
# A tibble: 6 x 4
s1 s2 v1 v2
<chr> <chr> <dbl> <int>
1 a b 10.0 20
2 a b 22.0 NA
3 a b 13.0 33
4 c d 3.0 NA
5 c d 4.5 NA
6 c d 10.0 20
Using data.table since you have big data:
> library(data.table)
data.table 1.10.4
The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way
Documentation: ?data.table, example(data.table) and browseVignettes("data.table")
Release notes, videos and slides: http://r-datatable.com
> dat=data.table(dat)
Without removing NA and keeping V1 filter as 0.1
> dat1=dat[v1>0.1,.N,.(s1,s2)]
> dat1
s1 s2 N
1: a b 3
2: c d 3
Removing v2 NA and keeping V1 filter as 0.1
> dat2=dat[v1>0.1&is.na(v2)==F,.N,.(s1,s2)]
> dat2
s1 s2 N
1: a b 2
2: c d 1
Merging the two and keeping V1 filter as 0
> dat[v1 > 0, .N, by = .(s1, s2)][ dat[v1 > 0 & !is.na(v2), .N, by = .(s1, s2)] , on = c("s1", "s2") , nomatch = 0 ]
s1 s2 N i.N
1: a b 3 2
2: c d 3 1
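Both counts can also be produced in a single data.table call (a sketch, with the v1 filter as 0), which avoids the join:
v1_filter <- 0
dat[v1 >= v1_filter, .(total_line = .N, non_na_line = sum(!is.na(v2))), by = .(s1, s2)]
#    s1 s2 total_line non_na_line
# 1:  a  b          3           2
# 2:  c  d          3           1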

Resources