List merging using names and varnames - r

I would like to perform a left_join between elements of two lists, conditional on the fact that the lista share the same name (I cannot use a simple left_joinon the databases for memory reasons).
set.seed(0)
db_m1 <- data.frame(
y=rep(1,20),
id=sort(rep(paste0("id_",c(letters[1:4])),5)),
m1=rep(c(100,200),10),
x1=sample(LETTERS, 20, TRUE),
x2=sample(LETTERS, 20, TRUE)) %>%
split(f=list(.$y,.$m1))
set.seed(0)
db_m2 <- data.frame(y=rep(1,20),
m1=sample(seq(100,500,100),20,TRUE),
m2=sample(c(6:10),20,TRUE)) %>%
split(f=list(.$y,.$m1))
Desiderata:
$`1.100`
y id m1 x1 x2 m2
1 1 id_a 100 N O 10
2 1 id_a 100 N O 7
(...)
$`1.200`
y id m1 x1 x2 m2
7 1 id_a 200 Y U 6
8 1 id_a 200 Y U 9
(...)

keys <- unique(c(names(candidates_0_s)))
setNames(lapply(keys, function(key){
result <- left_join(candidates_0_s[[key]],allies_0_s[[key]])
return(result)}
),keys)

same idea, but in base R and working with your example:
common <- intersect(names(db_m1), names(db_m2))
merge_items <- function(item) merge(db_m1[[item]], db_m2[[item]], all.x = TRUE)
res <- lapply(common, merge_items)
output:
> lapply(res, head)
[[1]]
y m1 id x1 x2 m2
1 1 100 id_a N O 10
2 1 100 id_a N O 7
3 1 100 id_a N O 7
4 1 100 id_a N O 9
5 1 100 id_a N O 9
6 1 100 id_a N O 6
[[2]]
y m1 id x1 x2 m2
1 1 200 id_a Y U 6
2 1 200 id_a Y U 9
3 1 200 id_a Y U 9
4 1 200 id_a Y U 7
5 1 200 id_a G I 6
6 1 200 id_a G I 9

Related

Looking up value from table in R based on the column and row names

Currently I wish to lookup a value from below table in R. For example, if I execute
eval(parse("Tableg1004"))
I would get the value from column g and row 1004 which is "Y".
Table c g s o
1000 2 N N N
1001 3 Y N N
1002 5 Y Y N
1003 9 Y N N
1004 11 Y N N
1005 13 N N Y
How could I achieve this?
I don't think we should/can use eval parse expression here (see link above in comment from Ronak Shah). Ok - maybe we could. But we definitively shouldn't. What we could do instead is either join the 'lookup table' to the data:
lookup_table <- read.table(text="Table c g s o
1000 2 N N N
1001 3 Y N N
1002 5 Y Y N
1003 9 Y N N
1004 11 Y N N
1005 13 N N Y", header=T, stringsAsFactors=F)
data_table <- data.frame(Table = 1000:1007,
v1 = 1:8)
data_table_joined <- merge(data_table, lookup_table[, c("Table", "g")], by="Table", all.x=TRUE)
Table v1 g
1 1000 1 N
2 1001 2 Y
3 1002 3 Y
4 1003 4 Y
5 1004 5 Y
6 1005 6 N
7 1006 7 NA
8 1007 8 NA
Or we create a function that takes the value of the column to extract as a parameter:
look_up <- function(t, c) {
f <- function(t, c) {
r <- lookup_table[lookup_table$Table %in% t, c]
return(ifelse(is.null(r), NA, r))
}
return(unlist(mapply(FUN = f, t, c)))
}
We can use this function like this:
look_up(c(1005, 1005), c("g", "o"))
Or even like that:
library(dplyr)
data_table %>%
mutate(from_lookup = look_up(Table, "o"))
Table v1 from_lookup
1 1000 1 N
2 1001 2 N
3 1002 3 N
4 1003 4 N
5 1004 5 N
6 1005 6 Y
7 1006 7 <NA>
8 1007 8 <NA>

Getting the length of a list

I am attempting to decipher a list res which has structure as per below:
How would I go about converting this to a 21 (row) by 2 (column) dataframe?
I can do it by manually hard-coding the 21:
data.frame(matrix(unlist(res), nrow=21 ))
However I would like to use length(res) which unfortunately returns 1
As it is a list use [[ to index it to get the matrix and then convert to dataframe.
data.frame(res[[1]])
Or use unlist with recursive = FALSE
data.frame(unlist(res[[1]], recursive = FALSE))
Using a reproducble example,
res <- list(matrix(letters,ncol = 2))
data.frame(res[[1]])
# X1 X2
#1 a n
#2 b o
#3 c p
#4 d q
#5 e r
#6 f s
#7 g t
#8 h u
#9 i v
#10 j w
#11 k x
#12 l y
#13 m z
You can also magrittr::extract2
res %>% magrittr::extract2(1)
## A tibble: 21 x 2
# V1 V2
# <chr> <chr>
# 1 O M
# 2 W S
# 3 C Q
# 4 L C
# 5 M K
# 6 R M
# 7 U Q
# 8 I T
# 9 K J
#10 H V
## … with 11 more rows
or use purrr::flatten_dfc
purrr::flatten_dfc(res)
## A tibble: 21 x 2
# V1 V2
# <chr> <chr>
# 1 O M
# 2 W S
# 3 C Q
# 4 L C
# 5 M K
# 6 R M
# 7 U Q
# 8 I T
# 9 K J
#10 H V
## … with 11 more rows
Sample data
set.seed(2018)
res <- list(
as_tibble(matrix(sample(LETTERS, 21 * 2, replace = T), nrow = 21, ncol = 2))
)

Repeat data frame with varying date column

How can I repeat a data frame with varying date column at the end? If I apply one of the previously recommended ways, all the columns get repeated. For example:
df<-data.frame(x1=c(1:3), x2=c('z','g','h'), x3=c( rep( as.Date("2011-07-31"), by=1, len=3)) )
n=2
do.call("rbind", replicate(n, df, simplify = FALSE))
x1 x2 x3
1 1 z 2011-07-31
2 2 g 2011-07-31
3 3 h 2011-07-31
4 1 z 2011-07-31
5 2 g 2011-07-31
6 3 h 2011-07-31
Whereas what I need is:
x1 x2 x3
1 1 z 2011-07-31
2 2 g 2011-07-31
3 3 h 2011-07-31
4 1 z 2011-08-01
5 2 g 2011-08-01
6 3 h 2011-08-01
> n=2
> df1 <- df[rep(1:nrow(df), n),]
> transform(df1, x3=ave(x3, x1, FUN=function(x) x + 1:length(x) - 1L))
x1 x2 x3
1 1 z 2011-07-31
2 2 g 2011-07-31
3 3 h 2011-07-31
1.1 1 z 2011-08-01
2.1 2 g 2011-08-01
3.1 3 h 2011-08-01
or
> library(dplyr)
> df1 <- df[rep(1:nrow(df), n),]
> df1 %>% group_by(x1,x2) %>% mutate(x3= x3 + 1:n() - 1L)
Here is another base R method that works for your example.
# save result
dat <- do.call("rbind", replicate(n, df, simplify = FALSE))
# update x3 variable
dat$x3 <- dat$x3 + cumsum(dat$x1 == 1) - 1
The logic is that we use a cumulative sum that is incremented every time x1 returns to its initial value (here 1). We subtract 1 from the result as we don't want to alter the first block.
this returns
dat
x1 x2 x3
1 1 z 2011-07-31
2 2 g 2011-07-31
3 3 h 2011-07-31
4 1 z 2011-08-01
5 2 g 2011-08-01
6 3 h 2011-08-01
Using transform, this can be written
transform(dat, x3 = x3 + cumsum(x1 == 1) - 1)
As an alternative counting procedure, we could use seq_len together with rep like this
# update x3 variable
dat$x3 <- dat$x3 + rep(seq_len(n)-1L, each=nrow(df))

Getting the maximum common words in R

I have data of the form:
ID A1 A2 A3 ... A100
1 john max karl ... kevin
2 kevin bosy lary ... rosy
3 karl lary bosy ... hale
.
.
.
10000 isha john lewis ... dave
I want to get one ID for each ID such that both of them have maximum number of common attributes(A1,A2,..A100)
How can I do this in R ?
Edit: Let's call the output a MatchId:
ID MatchId
1 70
2 4000
.
.
10000 3000
I think this gets what you're looking for:
library(dplyr)
# make up some data
set.seed(1492)
rbind_all(lapply(1:15, function(i) {
x <- cbind.data.frame(stringsAsFactors=FALSE, i, t(sample(LETTERS, 10)))
colnames(x) <- c("ID", sprintf("A%d", 1:10))
x
})) -> dat
print(dat)
## Source: local data frame [15 x 11]
##
## ID A1 A2 A3 A4 A5 A6 A7 A8 A9 A10
## 1 1 H F E C B A R J Z N
## 2 2 Q P E M L Z C G V Y
## 3 3 Q J D N B T L K G Z
## 4 4 D Y U F V O I C A W
## 5 5 T Z D I J F R C B S
## 6 6 Q D H U P V O E R N
## 7 7 C L I M E K N S X Z
## 8 8 M J S E N O F Y X I
## 9 9 R H V N M T Q X L S
## 10 10 Q H L Y B W S M P X
## 11 11 M N J K B G S X V R
## 12 12 W X A H Y D N T Q I
## 13 13 K H V J D X Q W A U
## 14 14 M U F H S T W Z O N
## 15 15 G B U Y E L A Q W O
# get commons
rbind_all(lapply(1:15, function(i) {
rbind_all(lapply(setdiff(1:15, i), function(j) {
data.frame(id1=i,
id2=j,
common=length(intersect(c(t(dat[i, 2:11])),
c(t(dat[j, 2:11])))))
}))
})) -> commons
commons %>%
group_by(id1) %>%
top_n(1, common) %>%
filter(row_number()==1) %>%
select(ID=id1, MatchId=id2)
## Source: local data frame [15 x 2]
## Groups: ID
##
## ID MatchId
## 1 1 5
## 2 2 7
## 3 3 5
## 4 4 12
## 5 5 1
## 6 6 9
## 7 7 8
## 8 8 7
## 9 9 10
## 10 10 9
## 11 11 9
## 12 12 13
## 13 13 12
## 14 14 8
## 15 15 2
Using similar data as provided by #hrbrmstr
set.seed(1492)
dat <- do.call(rbind, lapply(1:15, function(i) {
x <- cbind.data.frame(stringsAsFactors=FALSE, i, t(sample(LETTERS, 10)))
colnames(x) <- c("ID", sprintf("A%d", 1:10))
x
}))
You could achieve the same using base R only
Res <- sapply(seq_len(nrow(dat)),
function(x) apply(dat[-1], 1,
function(y) length(intersect(dat[x, -1], y))))
diag(Res) <- -1
cbind(dat[1], MatchId = max.col(Res, ties.method = "first"))
# ID MatchId
# 1 1 5
# 2 2 7
# 3 3 5
# 4 4 12
# 5 5 1
# 6 6 9
# 7 7 8
# 8 8 7
# 9 9 10
# 10 10 9
# 11 11 9
# 12 12 13
# 13 13 12
# 14 14 8
# 15 15 2
If I understand correctly, the requirement is to obtain the maximum number of common attributes for each ID.
Frequency tables can be obtained using table() and recursively in lapply(), assuming that ID column is unique - slight modification is necessary if not (unique(df$ID) rather than df$ID in lapply()). The maximum frequencies can be taken and, if there is a tie, only the first one is chosen. Finally they are combined by do.call().
df <- read.table(header = T, text = "
ID A1 A2 A3 A100
1 john max karl kevin
2 kevin bosy lary rosy
3 karl lary bosy hale
10000 isha john lewis dave")
do.call(rbind, lapply(df$ID, function(x) {
tbl <- table(unlist(df[df$ID == x, 2:ncol(df)]))
data.frame(ID = x, MatchId = tbl[tbl == max(tbl)][1])
}))
# ID MatchId
#john 1 1
#kevin 2 1
#karl 3 1
#isha 10000 1

R: Euclidian distances between objects in a group

I want to create a matrix with similarities based on two identifiers, consider following matrix:
x1 <- c(2,2,2,3,1,2,4,6,4)
y1 <- c(5,4,3,3,4,2,1,6,3)
x2 <- c(8,2,7,3,1,2,2,2,6)
y2 <- c(1,3,3,3,1,2,4,3,8)
x3 <- c(4,4,1,2,4,6,3,2,9)
y3 <- c(1,2,3,3,1,2,4,6,1)
id1 <- c("a","a","a","a","b","b","b","b","b")
id2 <- c(2002,2002,2003,2003,2002,2002,2003,2003,2003)
dat <- data.frame(x1,y1,x2,y2,x3,y3,id1,id2)
For the groups marked by id1 and id2 I want to create the euclidean distance (sqrt((x1a-x1b)^2+(y1a-y1b)^2 + ... + (y3a-y3b)^2)) between the lines in the dataset. In the best case, there would be a new variable that indicates the distances of each line to each other line with the same id1 and id2. Please note that different numbers of members can be in each group as for instance in 2003 in the b-group there are three cases.
Any advice would be great!!!
I think it would be a good idea first to distinguish the lines whose distances you want to calculate. For example, for id1 == b and id2 == 2003 you have 3 lines, and you want to calculate 3 different distances (between each possible pair). So let's first assign each of these a unique id.
f <- function(n) {
# Returns a vector
# 1, 2, 1, 3, ..., 1, n, 2, 3, 2, 4, ..., 2, n, ..., (n-1), n
m <- matrix(ncol = 2, nrow = n * (n-1) / 2)
m[, 1] <- rep(1:(n-1), (n-1):1)
m[, 2] <- unlist(lapply(2:n, function(x) x:n))
as.numeric(t(m))
}
# Alternatively,
# f <- function(n) {
# d <- expand.grid(a = 1:n, b = 1:n)
# d <- d[d$a < d$b, ]
# unlist(d)
# }
# but this is slower
# Using plyr...
library(plyr)
dat <- ddply(dat, .(id1, id2), function(d) {
d <- d[f(nrow(d)), ]
d$id3 <- paste0(d$id1, rep(1:(nrow(d) / 2), each = 2))
d
})
# ...or using base R
dat <- do.call(rbind,
by(dat, list(dat$id1, dat$id2), function(d) {
d <- d[f(nrow(d)), ]
d$id3 <- paste0(d$id1, rep(1:(nrow(d) / 2), each = 2))
d
}))
Now there will only be two lines for each (id3, id2) pair and you can calculate the differences as follows
# Using plyr
result <- ddply(dat, .(id3, id2), function(d) {
d <- d[paste0(rep(c("x", "y"), 3), 1:3)]
d$dist <- sqrt(sum((d[1, ] - d[2, ])^2))
d
})
# Base R
result <- do.call(rbind,
by(dat[paste0(rep(c("x", "y"), 3), 1:3)],
list(dat$id3, dat$id2),
function(d){
d$dist <- sqrt(sum((d[1, ] - d[2, ])^2))
d
}
))
result[c("id3", "id2")] <- dat[c("id3", "id2")]
result
# x1 y2 x3 y1 x2 y3 dist id3 id2
# 1 2 1 4 5 8 1 6.480741 a1 2002
# 2 2 3 4 4 2 2 6.480741 a1 2002
# 5 1 1 4 4 1 1 3.464102 b1 2002
# 6 2 2 6 2 2 2 3.464102 b1 2002
# 3 2 3 1 3 7 3 4.242641 a1 2003
# 4 3 3 2 3 3 3 4.242641 a1 2003
# 7 4 4 3 1 2 4 5.916080 b1 2003
# 8 6 3 2 6 2 6 5.916080 b1 2003
# 7.1 4 4 3 1 2 4 9.000000 b2 2003
# 9 4 8 9 3 6 1 9.000000 b2 2003
# 8.1 6 3 2 6 2 6 11.313708 b3 2003
# 9.1 4 8 9 3 6 1 11.313708 b3 2003
Maybe this could be helpful.
dist(dat[which(dat[,"id1"]=="a" & dat[,"id2"]=="2002"),], method ="euclidean")
dist(dat[which(dat[,"id1"]=="b" & dat[,"id2"]=="2003"),], method ="euclidean")

Resources