Compute mean pairwise covariance between elements in a list - r

I have the following data frames:
# df1
id cg_v
1 a
2 b
3 a b
4 b c
5 b c d
6 d
# df2
id cg
1 a
2 b
3 a
3 b
4 b
4 c
5 b
5 c
5 d
6 d
I need to add a column to df1 that contains the mean covariance computed across each pair of elements in cg_v. If cg_v contains only one element, then I would like the new column to contain its variance.
I can get a covariance matrix by cov(crossprod(table(df2)))
# a b c d
a 0.9166667 0.0000000 -0.5833333 -0.6666667
b 0.0000000 2.0000000 1.0000000 0.0000000
c -0.5833333 1.0000000 0.9166667 0.3333333
d -0.6666667 0.0000000 0.3333333 0.6666667
What do I do from here?
The end result should be like this:
# df1
id cg_v cg_cov
1 a 0.9166667
2 b 2.0000000
3 a b 0.0000000
4 b c 1.0000000
5 b c d 0.4444444 # This is equal to (1.0000000 + 0.3333337 + 0.0000000)/3
6 d 0.6666667
Code to generate df1 and df2:
df1 <- structure(list(id = c(1L, 2L, 3L, 4L, 5L, 6L),
cg_v = c("a", "b", "a b", "b c", "b c d", "d")),
.Names = c("id", "cg_v"),
class = "data.frame", row.names = c(NA, -6L))
df2 <- structure(list(id = c(1L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 6L),
cg = c("a", "b", "a", "b", "b", "c", "b", "c", "d", "d")),
.Names = c("id", "cg"),
class = "data.frame", row.names = c(NA, -10L))

I think I found a solution for this problem using data.tables and reshape. What do you want to do with the three letters b c d? I assumed that you want to have the covariance of the first two letters:
require(reshape)
require(data.table)
dt1 <- data.table(id = c(1L, 2L, 3L, 4L, 5L, 6L),
cg_v = c("a", "b", "a b", "b c", "b c d", "d"))
dt2 <- data.table(id = c(1L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 6L),
cg = c("a", "b", "a", "b", "b", "c", "b", "c", "d", "d"))
cov_dt <- data.table(melt(cov(crossprod(table(df2)))))
dt1 <- cbind(dt1, t(sapply(strsplit(as.character(df1$cg_v), " "), function(x)x[1:2])))
#replace the na with the first colomn
dt1[is.na(V2), V2 := V1]
# Merge them on two columns
setkey(dt1, "V1", "V2")
setkey(cov_dt, "X1", "X2")
result <- cov_dt[dt1]
> result[,.(id, cg_v, value)]
id cg_v value
1: 1 a 0.9166667
2: 3 a b 0.0000000
3: 2 b 2.0000000
4: 4 b c 1.0000000
5: 5 b c d 1.0000000
6: 6 d 0.6666667
Variant which also works if there are more than 2 letters (not the most efficient code):
require(reshape)
require(combinat)
df1 <- data.frame(id = c(1L, 2L, 3L, 4L, 5L, 6L),
cg_v = c("a", "b", "a b", "b c", "b c d", "d"))
df2 <- data.frame(id = c(1L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 6L),
cg = c("a", "b", "a", "b", "b", "c", "b", "c", "d", "d"))
cov_dt <- cov(crossprod(table(df2)))
mat <- sapply(strsplit(as.character(df1$cg_v), " "), function(x) if(length(x) == 1){c(x,x)} else(x))
# Should be all minimal 2
sapply(mat, length) > 1
mat <- sapply(mat, function(x) matrix(combn(x,2), nrow = 2))
df1$cg_cov <- sapply(mat, function(x) mean(apply(x,2, function(x) cov_dt[x[1],x[2]])))
> df1
id cg_v cg_cov
1 1 a 0.9166667
2 2 b 2.0000000
3 3 a b 0.0000000
4 4 b c 1.0000000
5 5 b c d 0.4444444
6 6 d 0.6666667

Related

How to move two specific rows to top of dataframe?

Below I have a DF.
A B C D
a 4 2 2
g 5 2 2
d 7 65 7
e 3 6 7
I would like to make this DF so that column A has "g" in the first row, and "d" in the second row. I would like to do this by calling the value in column A (rather than an index). How can I do this?
Ideal output
A B C D
g 5 2 2
d 7 65 7
a 4 2 2
e 3 6 7
We may convert to factor with levels specified in an order before arrangeing
library(forcats)
library(dplyr)
DF %>%
arrange(fct_relevel(A, 'g', 'd'))
A B C D
1 g 5 2 2
2 d 7 65 7
3 a 4 2 2
4 e 3 6 7
with fct_relevel, we can specify the order of specific levels without specifying the rest of the levels
> with(DF, fct_relevel(A, 'g', 'd'))
[1] a g d e
Levels: g d a e
data
DF <- structure(list(A = c("a", "g", "d", "e"), B = c(4L, 5L, 7L, 3L
), C = c(2L, 2L, 65L, 6L), D = c(2L, 2L, 7L, 7L)), class = "data.frame",
row.names = c(NA,
-4L))
Another possible solution:
library(dplyr)
df <- data.frame(
stringsAsFactors = FALSE,
A = c("a", "g", "d", "e"),
B = c(4L, 5L, 7L, 3L),
C = c(2L, 2L, 65L, 6L),
D = c(2L, 2L, 7L, 7L)
)
df %>% arrange(match(A, c("g", "d", setdiff(c("g", "d"), A))))
#> A B C D
#> 1 g 5 2 2
#> 2 d 7 65 7
#> 3 a 4 2 2
#> 4 e 3 6 7
Try the code below
with(
df,
df[match(c("g","d",A[!A%in%c("g","d")]),A),]
)
and you will see
A B C D
2 g 5 2 2
3 d 7 65 7
1 a 4 2 2
4 e 3 6 7
Just to add a base R solution if you are not interested in external packages, you can specify the row order directly:
# Sample Data
DF <- structure(list(A = c("a", "g", "d", "e"), B = c(4L, 5L, 7L, 3L
), C = c(2L, 2L, 65L, 6L), D = c(2L, 2L, 7L, 7L)), class = "data.frame",
row.names = c(NA, -4L))
A hard code for this example:
DF2 <- DF[c(2,3,1,4),]
A more generalizable example:
# specify desired rows
rownums <- which(DF$A %in% c("g","d"), arr.ind = TRUE)
# Specify other rows
otherrows <- seq(1:nrow(DF))[!(seq(1:nrow(DF)) %in% rownums)]
# Organize
DF2 <- DF[c(rownums,otherrows),]

R: Applying a function on data frame columns defined in another table

I have a dataframe (df) that looks like this:
A B C D E F G H
a 1 2 3 4 5 3 4 2
b 3 4 5 5 4 5 5 4
c 1 4 6 7 9 6 7 4
d 2 4 5 7 8 5 7 4
e 2 2 4 5 7 4 5 2
I would like to compute the ratio of different columns, based on a table like this, that indicates which column needs to be the numerator and denominator:
num denom
A E
B G
F C
H D
So for example the first column in the output will be column A divided by column E.
The output would look like this (I want to keep the column names of the numerators):
A B F H
a 0.2 0.5 1 0.5
b 0.75 0.8 1 0.8
c 0.111111111 0.571428571 1 0.571428571
d 0.25 0.571428571 1 0.571428571
e 0.285714286 0.4 1 0.4
I hope I am making this understandable, here is the dput of the original table:
structure(list(A = c(1L, 3L, 1L, 2L, 2L), B = c(2L, 4L, 4L, 4L,
2L), C = c(3L, 5L, 6L, 5L, 4L), D = c(4L, 5L, 7L, 7L, 5L), E = c(5L,
4L, 9L, 8L, 7L), F = c(3L, 5L, 6L, 5L, 4L), G = c(4L, 5L, 7L,
7L, 5L), H = c(2L, 4L, 4L, 4L, 2L)), .Names = c("A", "B", "C",
"D", "E", "F", "G", "H"), class = "data.frame", row.names = c("a",
"b", "c", "d", "e"))
Any help will be much appreciated! Thanks
Even easier with dplyr::transmute()
library(dplyr)
df %>% transmute(A=A/E, B=B/G, F=F/C, H=H/D)
# A B F H
#1 0.2000000 0.5000000 6.00 0.5000000
#2 0.7500000 0.8000000 6.25 0.8000000
#3 0.1111111 0.5714286 10.50 0.5714286
#4 0.2500000 0.5714286 8.75 0.5714286
#5 0.2857143 0.4000000 10.00 0.4000000
Here we go:
mdf <- data.frame(num=c("A", "B", "F", "H"), denom=c("E", "G", "C", "D"), stringsAsFactors = F)
df_num <- df[, mdf$num]
df_denom <- df[, mdf$denom]
df_num/df_denom
The stringAsFactors = Fis important...

Turning list into a data.frame

mylist <- list(structure(c(1L, 1L, 2L, 2L, 2L, 2L, NA, NA), .Names = c("A",
"B", "C", "D", "E", "F", "G", "H")), structure(c(1L, 1L, 1L,
1L, 1L, 2L, 1L, NA), .Names = c("A", "B", "C", "D", "E", "F",
"G", "H")))
mylist
[[1]]
A B C D E F G H
1 1 2 2 2 2 NA NA
[[2]]
A B C D E F G H
1 1 1 1 1 2 1 NA
I have a list like above and I want to collapse it into a data.frame so that I can subset each column individually ie df$A, df$B, etc.
> df$A
[1] 1 1
> df$B
[1] 1 1
> df$C
[1] 2 1
And so forth
You could unlist and the split according to the names, something like
temp <- unlist(mylist)
res <- split(unname(temp), names(temp))
# res$A
# [1] 1 1
# res$B
# [1] 1 1
# res$C
# [1] 2 1

How to collapse session path data into from-to paths for visualizing network data?

What are some ways to transform session path data such as this:
df
# Session Link1 Link2 Link3 Link4 Link5
# 1 1 A B
# 2 2 C
# 3 3 D A B
# 4 4 C F G H J
# 5 5 A B C
Into a data set that looks like this:
desired
# Session From To
# 1 1 A B
# 2 2 C <NA>
# 3 3 D A
# 4 3 A B
# 5 4 C F
# 6 4 F G
# 7 4 G H
# 8 4 H J
# 9 5 A B
# 10 5 B C
Data for reproducibility:
df <- structure(list(Session = 1:5, Link1 = structure(c(1L, 2L, 3L, 2L, 1L), .Label = c("A", "C", "D"), class = "factor"), Link2 = structure(c(3L, 1L, 2L, 4L, 3L), .Label = c("", "A", "B", "F"), class = "factor"), Link3 = structure(c(1L, 1L, 2L, 4L, 3L), .Label = c("", "B", "C", "G"), class = "factor"), Link4 = structure(c(1L, 1L, 1L, 2L, 1L), .Label = c("", "H"), class = "factor"), Link5 = structure(c(1L, 1L, 1L, 2L, 1L), .Label = c("", "J"), class = "factor")), .Names = c("Session", "Link1", "Link2", "Link3", "Link4", "Link5"), class = "data.frame", row.names = c(NA, -5L))
desired <- structure(list(Session = c(1L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L), From = structure(c(1L, 3L, 4L, 1L, 3L, 5L, 6L, 7L, 1L, 2L), .Label = c("A", "B", "C", "D", "F", "G", "H"), class = "factor"), To = structure(c(2L, NA, 1L, 2L, 4L, 5L, 6L, 7L, 2L, 3L), .Label = c("A", "B", "C", "F", "G", "H", "J"), class = "factor")), .Names = c("Session", "From", "To"), class = "data.frame", row.names = c(NA, -10L))
We could use data.table. Convert the 'data.frame' to 'data.table' (setDT(df)). Reshape from 'wide' to 'long' format with melt specifying the id.var as 'Session'. Remove the 'value' elements that are empty [value!='']. Grouped by 'Session', we insert 'NA' values in the 'value' column for those 'Session' that have only a single row (if...else), create a two columns ('From' and 'To') by removing the last and first element of 'V1' grouped by 'Session'.
library(data.table)#v1.9.5+
melt(setDT(df), id.var='Session')[value!=''][,
if(.N==1L) c(value, NA) else value, by = Session][,
list(From=V1[-.N], To=V1[-1L]), by = Session]
# Session From To
#1: 1 A B
#2: 2 C NA
#3: 3 D A
#4: 3 A B
#5: 4 C F
#6: 4 F G
#7: 4 G H
#8: 4 H J
#9: 5 A B
#10: 5 B C
The above could be simplified to a single block after the melt step. For some reason, tmp[-.N] is not working. So I used tmp[1:(.N-1)].
melt(setDT(df), id.var= 'Session')[value!='', {
tmp <- if(.N==1L) c(value, NA) else value
list(From= tmp[1:(.N-1)], To= tmp[-1L]) }, by = Session]
# Session From To
#1: 1 A B
#2: 2 C NA
#3: 3 D A
#4: 3 A B
#5: 4 C F
#6: 4 F G
#7: 4 G H
#8: 4 H J
#9: 5 A B
#10: 5 B C
Inspired by #akrun, this is my personal stab at the problem. Granted, the results are tweaked to include the terminal from-to path for each pair:
library(dplyr)
library(tidyr)
gather(df, "Link_Num", "Value", -Session) %>%
group_by(Session) %>%
mutate(to = Value,
from = lag(to)) %>%
filter(Link_Num != "Link1" &
from != "") %>%
select(Session, from, to, Link_Num) %>%
arrange(Session)
Which yields:
Session from to Link_Num
1 1 A B Link2
2 1 B Link3
3 2 C Link2
4 3 D A Link2
5 3 A B Link3
6 3 B Link4
7 4 C F Link2
8 4 F G Link3
9 4 G H Link4
10 4 H J Link5
11 5 A B Link2
12 5 B C Link3
13 5 C Link4
Another approach with dplyr functions melt and lead:
library(dplyr)
df$spacer <- ""
df %>% melt(id.var = "Session") %>%
arrange(Session) %>%
mutate(To = lead(value)) %>%
filter(To !="" & value !="" | To =="" & variable =="Link1") %>%
mutate(To = ifelse(To == "", NA, To)) %>% select(-variable)
# Session value To
# 1 1 A B
# 2 2 C <NA>
# 3 3 D A
# 4 3 A B
# 5 4 C F
# 6 4 F G
# 7 4 G H
# 8 4 H J
# 9 5 A B
# 10 5 B C

loop or function to create new cols and fill based on lists

I would like to write a function or loop that will create three new columns, then fill these columns with either the same value or a specified value, if the value in the original column is within one of three specified lists.
For example, here is what the data looks like:
> data
a1 a2 a3
1 C C A
2 A B_20 B_20
3 A C B_30
4 C C B_40
5 C A A
6 B_60 B_60 B_60
7 A A C
8 A C B_80
9 B_90 C B_90
I want to create three new columns (a1_t, a2_t, a3_t) where if a1 is in list1
list1 <-c('B_10','B_20','B_30')
then fill in a1_t, with B_00_30
or if a1 is in list2
list2 <-c('B_40','B_50','B_60')
then fill in a1_t, with B_40_60
or if a1 is in list3
list3 <-c('B_70','B_80','B_90')
then fill in a1_t, with B_70_90
if not in list1, list2, or list3, then place the value from a1 to a1_t.
Then iterate through this same procedure for a2_t and a3_t using a2 and a3 for matching.
In the end I would like the output to look like this:
> data
a1 a2 a3 a1_t a2_t a3_t
1 A A B_10 A A B_00_30
2 B_20 A C B_00_30 A C
3 B_30 A C B_00_30 A C
4 C C A C C A
5 A B_50 B_50 A B_40_60 B_40_60
6 C C A C C A
7 C B_70 A C B_70_90 A
8 B_80 C B_80 B_70_90 C B_70_90
9 B_90 C A B_70_90 C A
To create original raw data:
data <- structure(list(a1 = c("A", "B_20", "B_30", "C", "A", "C", "C",
"B_80", "B_90"), a2 = c("A", "A", "A", "C", "B_50", "C", "B_70",
"C", "C"), a3 = c("B_10", "C", "C", "A", "B_50", "A", "A", "B_80",
"A")), class = "data.frame", .Names = c("a1", "a2", "a3"), row.names = c(NA,
-9L))
To create desired output data:
data <- structure(list(a1 = structure(c(1L, 2L, 3L, 6L, 1L, 6L, 6L, 4L, 5L), .Label = c("A", "B_20", "B_30", "B_80", "B_90", "C"), class = "factor"),
a2 = structure(c(1L, 1L, 1L, 4L, 2L, 4L, 3L, 4L, 4L), .Label = c("A", "B_50", "B_70", "C"), class = "factor"),
a3 = structure(c(2L, 5L, 5L, 1L, 3L, 1L, 1L, 4L, 1L), .Label = c("A", "B_10", "B_50", "B_80", "C"), class = "factor"),
a1_t = structure(c(1L, 2L, 2L, 4L, 1L, 4L, 4L, 3L, 3L), .Label = c("A", "B_00_30", "B_70_90", "C"), class = "factor"),
a2_t = structure(c(1L, 1L, 1L, 4L, 2L, 4L, 3L, 4L, 4L), .Label = c("A", "B_40_60", "B_70_90", "C"), class = "factor"),
a3_t = structure(c(2L, 5L, 5L, 1L, 3L, 1L, 1L, 4L, 1L), .Label = c("A", "B_00_30", "B_40_60", "B_70_90", "C"), class = "factor")),
.Names = c("a1", "a2", "a3", "a1_t", "a2_t", "a3_t"), class = "data.frame", row.names = c(NA, -9L))
Thanks
-al
Final working code w/ answer:
library(dplyr)
list1 <-c('B_10','B_20','B_30')
list2 <-c('B_40','B_50','B_60')
list3 <-c('B_70','B_80','B_90')
lookup = rbind(cbind(list = list1, val = "B_00_30"),
cbind(list2, "B_40_60"),
cbind(list3, "B_70_90"))
g <- sapply(data, function(x) {
tmp = lookup[, 2][match(x, lookup[, 1])]
ifelse(is.na(tmp), x, tmp)
})
gd <- as.data.frame (g)
gd <- mutate (gd,a1_t=a1,a2_t=a2,a3_t=a3)
gd <- select (gd,a1_t,a2_t,a3_t)
h <- cbind (data,gd)
> h
a1 a2 a3 a1_t a2_t a3_t
1 A A B_10 A A B_00_30
2 B_20 A C B_00_30 A C
3 B_30 A C B_00_30 A C
4 C C A C C A
5 A B_50 B_50 A B_40_60 B_40_60
6 C C A C C A
7 C B_70 A C B_70_90 A
8 B_80 C B_80 B_70_90 C B_70_90
9 B_90 C A B_70_90 C A
A way could be:
lookup = rbind(cbind(list = list1, val = "B_00_30"),
cbind(list2, "B_40_60"),
cbind(list3, "B_70_90"))
sapply(data, function(x) {
tmp = lookup[, 2][match(x, lookup[, 1])]
ifelse(is.na(tmp), x, tmp)
})
# a1 a2 a3
# [1,] "A" "A" "B_00_30"
# [2,] "B_00_30" "A" "C"
# [3,] "B_00_30" "A" "C"
# [4,] "C" "C" "A"
# [5,] "A" "B_40_60" "B_40_60"
# [6,] "C" "C" "A"
# [7,] "C" "B_70_90" "A"
# [8,] "B_70_90" "C" "B_70_90"
# [9,] "B_70_90" "C" "A"
Then you can cbind to "data" and coerce to "data.frame" as needed.
Another way using cut
indx <- cut(as.numeric(gsub(".\\_","",as.matrix(data))),breaks=c(0,30,60,90),labels=F)
(Here, you will get a warning message because as.numeric on those elements that are characters will coerce them to NAs, which was my intention.)
or using info from list1:list3
val <- sapply(mget(ls(pattern="list")),function(x) max(as.numeric(gsub("._","",x))))
val
# list1 list2 list3
# 30 60 90
#indx <- cut(as.numeric(gsub(".\\_","",as.matrix(data))),breaks=c(0,val),labels=F)
indx[!is.na(indx)] <- c("B_00_30","B_40_60", "B_70_90")[indx[!is.na(indx)]]
indx[is.na(indx)] <- unlist(data)[!grepl("_", unlist(data))]
data1 <- data
data1[] <- indx
colnames(data1) <- paste(colnames(data1),"t",sep="_")
Update
To avoid the warning message, you could do:
m1 <- as.matrix(data)
indx <- grepl("\\d",gsub(".\\_","",m1))
indx1 <- cut(as.numeric(gsub(".\\_","",m1[indx])),breaks=c(0,30,60,90),labels=F)
m1[indx] <- c("B_00_30", "B_40_60", "B_70_90")[indx1]
data1 <- data
data1[] <- m1
colnames(data1) <- paste(colnames(data1),"t",sep="_")
cbind(data, data1)
# a1 a2 a3 a1_t a2_t a3_t
# 1 A A B_10 A A B_00_30
# 2 B_20 A C B_00_30 A C
# 3 B_30 A C B_00_30 A C
# 4 C C A C C A
# 5 A B_50 B_50 A B_40_60 B_40_60
# 6 C C A C C A
# 7 C B_70 A C B_70_90 A
# 8 B_80 C B_80 B_70_90 C B_70_90
# 9 B_90 C A B_70_90 C A

Resources