Combine table with different elements - r

I have items in different lists and I want to count the item in each list and output it to a table. However, I ran into difficulty when there are different items in the list. Too illustrate my problem:
item_1 <- c("A","A","B")
item_2 <- c("A","B","B","B","C")
item_3 <- c("C","A")
item_4 <- c("D","A", "A")
item_5 <- c("B","D")
list_1 <- list(item_1, item_2, item_3)
list_2 <- list(item_4, item_5)
table_1 <- table(unlist(list_1))
table_2 <- table(unlist(list_2))
> table_1
A B C
4 4 2
> table_2
A B D
2 1 2
What I get from cbind is :
> cbind(table_1, table_2)
table_1 table_2
A 4 2
B 4 1
C 2 2
which is clearly wrong. What I need is:
table_1 table_2
A 4 2
B 4 1
C 2 0
D 0 2
Thanks in advance

It would probably be better to use factors at the start if possible, something like:
L <- list(list_1 = list_1,
list_2 = list_2)
RN <- unique(unlist(L))
do.call(cbind,
lapply(L, function(x)
table(factor(unlist(x), RN))))
# list_1 list_2
# A 4 2
# B 4 1
# C 2 0
# D 0 2
However, going with what you have, a function like the following might be useful. I've added comments to help explain what's happening in each step.
myFun <- function(..., fill = 0) {
## Get the names of the ...s. These will be our column names
CN <- sapply(substitute(list(...))[-1], deparse)
## Put the ...s into a list
Lst <- setNames(list(...), CN)
## Get the relevant row names
RN <- unique(unlist(lapply(Lst, names), use.names = FALSE))
## Create an empty matrix. `fill` can be anything--it's set to 0
M <- matrix(fill, length(RN), length(CN),
dimnames = list(RN, CN))
## Use match to identify the correct row to fill in
Row <- lapply(Lst, function(x) match(names(x), RN))
## use matrix indexing to fill in the unlisted values of Lst
M[cbind(unlist(Row),
rep(seq_along(Lst), vapply(Row, length, 1L)))] <-
unlist(Lst, use.names = FALSE)
## Return your matrix
M
}
Applied to your two tables, the outcome is like this:
myFun(table_1, table_2)
# table_1 table_2
# A 4 2
# B 4 1
# C 2 0
# D 0 2
Here's an example with adding another table to the problem. It also demonstrates use of NA as a fill value.
set.seed(1) ## So you can get the same results as me
table_3 <- table(sample(LETTERS[3:6], 20, TRUE) )
table_3
#
# C D E F
# 2 7 9 2
myFun(table_1, table_2, table_3, fill = NA)
# table_1 table_2 table_3
# A 4 2 NA
# B 4 1 NA
# C 2 NA 2
# D NA 2 7
# E NA NA 9
# F NA NA 2

To fix your existing problem, you can put the two tables into a list and add the missing values an names back in. Here, nm is a vector of the table names unique to each table, tbs is a list of the tables, and we can use sapply to append and reorder the missing values.
> nm <- unique(unlist(mget(paste("item", 1:5, sep = "_"))))
> tbs <- list(t1 = table_1, t2 = table_2)
> sapply(tbs, function(x) {
x[4] <- 0L
names(x)[4] <- nm[!nm %in% names(x)]
x[nm]
})
t1 t2
A 4 2
B 4 1
C 2 0
D 0 2
A general solution, for when you have unknowns, and so that you can keep NA values, is
> sapply(tbs, function(x) {
length(x) <- length(nm)
x <- x[match(nm, names(x))]
setNames(x, nm)
})
t1 t2
A 4 2
B 4 1
C 2 NA
D NA 2
But you could have avoided this entirely by going straight from items to table. You put the items into a list and then unlisted them in the very next step. There is a useNA argument in table that will keep the factor levels even when they're zero.
> t1 <- table(c(item_1, item_2, item_3), useNA = "always")
> t2 <- table(c(item_4, item_5), useNA = "always")
> table(c(item_4, item_5), useNA = "always")
A B D <NA>
2 1 2 0

A quick fix to your problem is to make the tables into data frames and then merge them:
d1 <- data.frame(value=names(table_1), table_1=as.numeric(table_1))
d2 <- data.frame(value=names(table_2), table_2=as.numeric(table_2))
merge(d1,d2, all=TRUE)
This will create NA's where you might want 0's. That can be fixed with
M <- merge(d1,d2, all=TRUE)
M[is.na(M)] <- 0

Related

Select values when column names are stored as concatenated strings

It's hard to explain, so I'll start with an example. I have some numeric columns (A, B, C). The column 'tmp' contains variable names of the numeric columns as concatenated strings:
set.seed(100)
A <- floor(runif(5, min=0, max=10))
B <- floor(runif(5, min=0, max=10))
C <- floor(runif(5, min=0, max=10))
tmp <- c('A','B,C','C','A,B','A,B,C')
df <- data.frame(A,B,C,tmp)
A B C tmp
1 3 4 6 A
2 2 8 8 B,C
3 5 3 2 C
4 0 5 3 A,B
5 4 1 7 A,B,C
Now, for each row, I want to use the variable names in tmp to select the values from the corresponding numeric columns with the same name(s). Then I want to keep only the rows where all the selected values are less than or equal 3.
E.g. in the first row, tmp is A, and the corresponding value in column A is 3, i.e. keep this row.
Another example, in row 4, tmp is A,B. The corresponding values are A = 0 and B = 5. Thus, all selected values are not less than or equal 3, and this row is discarded.
Desired result:
A B C tmp
1 3 4 6 A
2 5 3 2 C
How can I perform such filtering?
This is a bit more complicated than I like and there might be a more elegant solution, but here we go:
#split tmp
col <- strsplit(df[["tmp"]], ",")
#create an index matrix
inds <- do.call(rbind, Map(data.frame, row = seq_along(col), col = col))
inds$col <- match(inds$col, names(df))
inds <- as.matrix(inds)
#check
chk <- m <- as.matrix(df[, names(df) != "tmp"])
mode(chk) <- "logical"
chk[] <- NA
chk[inds] <- m[inds] <= 3
sel <- apply(chk, 1, prod, na.rm = TRUE)
df[as.logical(sel),]
# A B C tmp
#1 3 4 6 A
#3 5 3 2 C
Not sure if it works always (and probably isn't the best solution)... but it worked here:
library(dplyr)
library(tidyr)
library(stringr)
List= vector("list")
for (i in 1:length(df)){
tmpT= as.vector(str_split(df$tmp[i], ",", simplify=TRUE))
selec= df %>%
select(tmpT) %>%
slice(which(row_number() == i)) %>%
filter_all(., all_vars(. <= 3)) %>%
unite(val, sep= ", ")
if (nrow(selec) == 0) {
tab= NA
} else{
tab= df[i,]
}
List[[i]] = tab
}
df2= do.call("rbind", List)
This answer has some similarities with #Roland's, but here we work with the data in a 'longer' format:
# create row index
df$ri = seq_len(nrow(df))
# split the concatenated column
l <- strsplit(df$tmp, ',')
# repeat each row of the data with the lengths of the split string,
# bind with individual strings
d = cbind(df[rep(1:nrow(df), lengths(l)), ], x = unlist(l))
# use match to grab values from corresponding columns
d$val <- d[cbind(seq(nrow(d)), match(d$x, names(d)))]
# for each original row 'ri', check if all values are <= 3. use result to index data frame
d[as.logical(ave(d$val, d$ri, FUN = function(x) all(x <= 3))), ]
# A B C tmp ri x val
# 1 3 4 6 A 1 A 3
# 3 5 3 2 C 3 C 2

R lapply: check if data frame contains a column. If not, create this column

I have a list of dataframes.
I would like to check every column name of the dataframes. If the column name is missing, I want to create this column to the dataframe, and complete with NA values.
Dummy data:
d1 <- data.frame(a=1:2, b=2:3, c=4:5)
d2 <- data.frame(a=1:2, b=2:3)
l<-list(d1, d2)
# Check the columns names of the dataframes
# If column is missing, add new column, add NA as values
lapply(l, function(x) if(!("c" %in% colnames(x)))
{
c<-rep(NA, nrow(x))
cbind(x, c) # does not work!
})
What I get:
[[1]]
NULL
[[2]]
a b c
1 1 2 NA
2 2 3 NA
What I want instead:
[[1]]
a b c
1 1 2 4
2 2 3 5
[[2]]
a b c
1 1 2 NA
2 2 3 NA
Thanks for your help!
You could use dplyr::mutate with an ifelse:
library(dplyr)
lapply(l, function(x) mutate(x, c = ifelse("c" %in% names(x), c, NA)))
[[1]]
a b c
1 1 2 4
2 2 3 4
[[2]]
a b c
1 1 2 NA
2 2 3 NA
You have some good answers, but if you want to stick to base R:
lapply(l, function(x)
if(!("c" %in% colnames(x))) {
c<-rep(NA, nrow(x))
return(cbind(x, c))
}
else(return(x))
)
Your code was returning NULL for the first df because you had no else statement to handle the case of c existing (i.e FALSE in the if statement).
One way is to use dplyr::bind_rows to bind data.frames in the list and fill entries from missing columns with NA, and then split the resulting data.frame again to produce a list of data.frames:
df <- dplyr::bind_rows(l, .id = "id");
lapply(split(df, df$id), function(x) x[, -1])
#$`1`
# a b c
#1 1 2 4
#2 2 3 5
#
#$`2`
# a b c
#3 1 2 NA
#4 2 3 NA
Or the same as a tidyverse/magrittr chain
bind_rows(l, .id = "id") %>% split(., .$id) %>% lapply(function(x) x[, -1])
library(purrr)
map(l, ~{if(!length(.x$c)) .x$c <- NA; .x})

Change column names in list of dataframes

I have a list of dataframes on which I want to change the name of the 2nd column in each dataframe in the list so that it matches the name of the list item that holds it. The code that I have at the moment is:
my_list <- list(one = data.frame(a <- 1:5, b <- 1:5), two = data.frame(a <- 1:5, b <- 1:5))
my_list <- lapply(seq_along(names(my_list)), function(x) names(my_list[[x]])[2] <- names(my_list)[x])
but my code just replaces the dataframes without me understanding why. Any help would be much appreciated.
I know that I can do this easily with a "for" loop, but I would like to avoid it, hence my question.
setNames can be convenient here:
my_list2 <- lapply(
names(my_list),
function(x) setNames(my_list[[x]], c(names(my_list[[x]])[1], x))
)
Or the same using Map (which I think is easier to read):
my_list2 <- Map(
function(x, n) setNames(x, c(names(x)[1], n)),
my_list, names(my_list)
)
> my_list2
$one
a one
1 1 1
2 2 2
3 3 3
4 4 4
5 5 5
$two
a two
1 1 1
2 2 2
3 3 3
4 4 4
5 5 5
A problem of names<- is that it returns the name, not the object.
Try this, loop through data.frames, update column name:
# dummy list
my_list <- list(one = data.frame(a = 1:5, b = 1:5), two = data.frame(a = 1:5, b = 1:5))
my_list_updated <-
lapply(names(my_list), function(i){
x <- my_list[[ i ]]
# set 2nd column to a new name
names(x)[2] <- i
# return
x
})
my_list_updated
# [[1]]
# a one
# 1 1 1
# 2 2 2
# 3 3 3
# 4 4 4
# 5 5 5
#
# [[2]]
# a two
# 1 1 1
# 2 2 2
# 3 3 3
# 4 4 4
# 5 5 5
You could to use the super assignment operator:
my_list <- list(one = data.frame(a = 1:5, b = 1:5), two = data.frame(a = 1:5, b = 1:5))
lapply(seq_along(names(my_list)), function(x) names(my_list[[x]])[2] <<- names(my_list)[x] )
my_list

What is the most clean & efficient way of joining two tables (dataframes) in R?

I'm trying to find out the most efficient way of joining data from one dataframe into another. The idea is that I have a master data set (df) and a secondary dataset (lookup). I want to append the the data in the lookup table to the master data set.
Theoretical data as follows:
COLUMN_A <- 1:5
COLUMN_B <- 1:5
LOOKUP_COL <- letters[1:5]
df <- data.frame(COLUMN_A,COLUMN_B,LOOKUP_COL)
COLUMN_A COLUMN_B LOOKUP_COL
1 1 1 a
2 2 2 b
3 3 3 c
4 4 4 d
5 5 5 e
COLUMN_A <- 2*(1:5)
LOOKUP_COL <- letters[1:5]
SPARE_COL <- runif(5)
lookup <- data.frame(COLUMN_A,LOOKUP_COL,SPARE_COL)
COLUMN_A LOOKUP_COL SPARE_COL
1 1 a 0.6113499
2 2 b 0.3712987
3 3 c 0.3551038
4 4 d 0.6650248
5 5 e 0.2680611
This is how I've been doing it so far:
results <- merge(df,lookup,by='LOOKUP_COL')
Which provides me with:
LOOKUP_COL COLUMN_A.x COLUMN_B COLUMN_A.y SPARE_COL
1 a 1 1 1 0.6113499
2 b 2 2 2 0.3712987
3 c 3 3 3 0.3551038
4 d 4 4 4 0.6650248
5 e 5 5 5 0.2680611
So it seems that the entire lookup table has been merged into the master data, SPARE_COL is surplus to requirements - how can I control what columns get passed into the master data? Essentially, I'm trying to understand how the functionality of an excel vlookup can be used in R.
thanks
EDIT: This one uses SPARE_COL as the one to keep instead of COLUMN_A. If you have columns with the same name in different dataframes, the solution with indices will require that you rename them in one of the data frames before merging everything together.
Single column
You can do this by passing only the columns you want to merge to the function merge. Obviously you have to keep the columns used for merging in your selection. Taking your example, this becomes:
keep <- c('LOOKUP_COL','SPARE_COL')
results <- merge(df,lookup[keep],by='LOOKUP_COL')
And the result is
> results
LOOKUP_COL COLUMN_A COLUMN_B SPARE_COL
1 a 1 1 0.75670441
2 b 2 2 0.52122950
3 c 3 3 0.99338019
4 d 4 4 0.71904088
5 e 5 5 0.05405722
By selecting the columns first, you make merge work faster and you don't have to bother about finding the columns you want after the merge.
If speed is an issue and the merge is simple, you can speed things up by manually doing the merge using indices:
id <- match(df$LOOKUP_COL, lookup$LOOKUP_COL)
keep <- c('SPARE_COL')
results <- df
results[keep] <- lookup[id,keep, drop = FALSE]
This gives the same result, and gives a good speedup.
more columns
Let's create an example with 2 lookup columns first:
N <- 10000
COLUMN_A <- 1:N
COLUMN_B <- 1:N
LOOKUP_COL <- sample(letters[3:7], N, replace = TRUE)
LOOKUP_2 <- sample(letters[10:14], N, replace = TRUE)
df <- data.frame(COLUMN_A,COLUMN_B,LOOKUP_COL, LOOKUP_2)
COLUMN_A <- 2*(1:36)
LOOKUP_COL <- rep(letters[1:6], each = 6)
LOOKUP_2 <- rep(letters[10:15], times = 6)
SPARE_COL <- runif(36)
lookup <- data.frame(COLUMN_A,LOOKUP_COL, LOOKUP_2, SPARE_COL)
You can use merge again like this:
keep <- c('LOOKUP_COL','SPARE_COL', 'LOOKUP_2')
results <- merge(df,lookup[keep],by=c('LOOKUP_COL', 'LOOKUP_2'))
And you can use indices again. Before you match, you have to create the interaction between the lookup columns. You can do this using the function
interaction() for any number of lookup columns:
lookups <- c('LOOKUP_COL','LOOKUP_2')
id <- match(interaction(df[lookups]),
interaction(lookup[lookups]))
keep <- c('SPARE_COL')
results <- df
results[keep] <- lookup[id,keep, drop = FALSE]
Timing
In the test below the speedup is about a 6-fold for the two-column case:
test replications elapsed relative user.self sys.self user.child
1 code1() 100 6.30 6.117 6.30 0 NA
2 code2() 100 1.03 1.000 1.03 0 NA
sys.child
1 NA
2 NA
The code for testing:
N <- 10000
COLUMN_A <- 1:N
COLUMN_B <- 1:N
LOOKUP_COL <- sample(letters[3:7], N, replace = TRUE)
LOOKUP_2 <- sample(letters[10:14], N, replace = TRUE)
df <- data.frame(COLUMN_A,COLUMN_B,LOOKUP_COL, LOOKUP_2)
COLUMN_A <- 2*(1:36)
LOOKUP_COL <- rep(letters[1:6], each = 6)
LOOKUP_2 <- rep(letters[10:15], times = 6)
SPARE_COL <- runif(36)
lookup <- data.frame(COLUMN_A,LOOKUP_COL, LOOKUP_2, SPARE_COL)
code1 <- function(){
keep <- c('LOOKUP_COL','SPARE_COL', 'LOOKUP_2')
results <- merge(df,lookup[keep],by=c('LOOKUP_COL', 'LOOKUP_2'))
}
code2 <- function(){
lookups <- c('LOOKUP_COL','LOOKUP_2')
id <- match(interaction(df[lookups]),
interaction(lookup[lookups]))
keep <- c('SPARE_COL')
results <- df
results[keep] <- lookup[id,keep, drop = FALSE]
}
require(rbenchmark)
benchmark(code1(),code2())
For manipulating and merging dataframes, I suggest package dplyr:
library(dplyr)
df %>%
left_join(lookup, by=c("LOOKUP_COL")) %>%
select(LOOKUP_COL, COLUMN_A=COLUMN_A.x, COLUMN_B, COLUMN_C=COLUMN_A.y)

How do I preserve list names when working with nested lists?

The advantage of using plyr's llply over lapply is that it preserves list names. See ?llply for an explanation. I love that feature but cannot seem to get it to work in the case of a nested list. Example:
library(plyr)
m <- as.list(1:2)
names(m) <- c('M1', 'M2')
foo <- list(m, m)
names(foo) <- paste0("R", 1:2)
result <- ldply(foo, function(x){
ldply(x, function(z) { data.frame(a=z, b= z^2)})
})
> result
.id a b
1 M1 1 1
2 M2 2 4
3 M1 1 1
4 M2 2 4
# if I don't operate on the inner list, I can preserve the outer list's names
result2 <- ldply(foo, function(x){
data.frame(a = x[[1]], b = x[[1]]^2)
})
> result2
  .id a b
1  R1 1 1
2  R2 1 1
Note that result does not contain R1 and R2 (which would get added in as .id if I didn't operate on the nested list inside each element of foo as is the case with result2). How do I make sure I get the outer list names added in when working on nested lists?
It seems that the column name is the problem:
result <- ldply(foo, function(x){
df <- ldply(x, function(z) { data.frame(a=z, b= z^2)})
names(df)[1] <- ".id2"; df
})
result
.id .id2 a b
1 R1 M1 1 1
2 R1 M2 2 4
3 R2 M1 1 1
4 R2 M2 2 4
The problem is that ldply does not assign to the .id variable if there is already one. If you look at the results of one of your inner ldply once, it is fine:
> ldply(foo[[1]], function(z) { data.frame(a=z, b= z^2)})
.id a b
1 M1 1 1
2 M2 2 4
Rename it out of the way and it works as expected.
result <- ldply(foo, function(x){
rename(ldply(x, function(z) { data.frame(a=z, b= z^2)}),
c(".id" = ".id2"))
})
gives
> result
.id .id2 a b
1 R1 M1 1 1
2 R1 M2 2 4
3 R2 M1 1 1
4 R2 M2 2 4

Resources