Matching two lists of unequal length - r

I am trying to match the values in 2 lists only where the variable names are the same between the lists. I would like the result to be a list the length of the longer list, filled with the count of total matches.
jac <- structure(list(s1 = "a", s2 = c("b", "c", "d"), s3 = 5),
.Names = c("s1", "s2", "s3"))
larger <- structure(list(s1 = structure(c(1L, 1L, 1L), .Label = "a", class = "factor"),
s2 = structure(c(2L, 1L, 3L), .Label = c("b", "c", "d"), class = "factor"),
s3 = c(1, 2, 7)), .Names = c("s1", "s2", "s3"), row.names = c(NA, -3L), class = "data.frame")
I am using mapply(FUN = pmatch, jac, larger) which gives me a correct total but not in the format that I would like below:
s1 s2 s3 s1result s2result s3result
a c 1 1 2 NA
a b 2 1 1 NA
a d 7 1 3 NA
However, I don't think pmatch will ensure the name matching in every situation so I wrote a function that I am still having issues with:
# Attempt (from the question) at matching `jac` against `larger` by name.
# NOTE(review): not working as posted -- see the inline notes below.
prodMatch <- function(jac,larger){
# NOTE(review): i runs over the rows of `larger`, yet below it is used to
# index the names of `jac` and the columns of `larger`.
for(i in 1:nrow(larger)){
# Proceed only when the i-th name of `jac` equals the name of the i-th
# column of `larger`.
if(names(jac)[i] %in% names(larger[i])){
# NOTE(review): `r` is never created inside this function, and
# `jac %in% larger[i]` yields one logical per element of `jac` while only
# the single slot r[i] is assigned.
r[i] <- jac %in% larger[i]
r
}
}
# NOTE(review): the last expression is the `for` loop, whose value is NULL,
# so `r` is not returned to the caller.
}
Can anyone help out?
Another dataset that causes one to not be a multiple of the other:
larger2 <-
structure(list(s1 = structure(c(1L, 1L, 1L), class = "factor", .Label = "a"),
s2 = structure(c(1L, 1L, 1L), class = "factor", .Label = "c"),
s3 = c(1, 2, 7), s4 = c(8, 9, 10)), .Names = c("s1", "s2",
"s3", "s4"), row.names = c(NA, -3L), class = "data.frame")

mapply returns a list of matching index, you can convert it to a data frame simply using as.data.frame:
as.data.frame(mapply(match, jac, larger))
# s1 s2 s3
# 1 1 2 NA
# 2 1 1 NA
# 3 1 3 NA
And cbind the result with larger gives what you expected:
cbind(larger,
setNames(as.data.frame(mapply(match, jac, larger)),
paste(names(jac), "result", sep = "")))
# s1 s2 s3 s1result s2result s3result
#1 a c 1 1 2 NA
#2 a b 2 1 1 NA
#3 a d 7 1 3 NA
Update: To take care of the cases where the names of the two lists don't match, we can loop through larger and its names simultaneously and extract the elements from jac as follows:
# For each column of `larger`, find where the values that `jac` stores under
# the same name occur in that column, and collect the results as a data frame.
as.data.frame(
  mapply(
    function(col, name) {
      # Positions within this column of the jac values sharing its name.
      hits <- match(jac[[name]], col)
      # A name missing from jac yields a zero-length result; report NA instead.
      if (length(hits) > 0) {
        hits
      } else {
        NA
      }
    },
    larger,
    names(larger)
  )
)
# s1 s2 s3
#1 1 2 NA
#2 1 1 NA
#3 1 3 NA

Related

How to extract a column based on column name?

I have a data frame df
m n o p
a 1 1 2 5
b 1 2 0 4
c 3 3 3 3
I can extract column m by:
df[,"m"]
Now the problem is, the column name was generated somewhere else (multiple times, in a for loop). For example, column name m was generated by choosing a specific element in the dataframe, gen, in one loop
:
> gen[i,1]
[1] m
How do I extract the column based on gen[i,1]?
Just nest the subsetting.
dat[,"m"]
# [1] 1 1 3
i <- 13
gen[i, 1]
# [1] "m"
dat[, gen[i, 1]]
# [1] 1 1 3
Or, if you don't want the column to be dropped:
dat[, gen[i, 1], drop=FALSE]
# m
# a 1
# b 1
# c 3
Data
dat <- structure(list(m = c(1L, 1L, 3L), n = 1:3, o = c(2L, 0L, 3L),
p = 5:3), class = "data.frame", row.names = c("a", "b", "c"
))
gen <- data.frame(letters)
We can use select from dplyr
library(dplyr)
i <- 13
dat %>%
select(gen[i, 1])
# m
#a 1
#b 1
#c 3
data
dat <- structure(list(m = c(1L, 1L, 3L), n = 1:3, o = c(2L, 0L, 3L),
p = 5:3), class = "data.frame", row.names = c("a", "b", "c"
))
gen <- data.frame(letters)

Replace a subset of data frame

I have a data frame with some error
T item V1 V2
1 a 2 .1
2 a 5 .8
1 b 1 .7
2 b 2 .2
I have another data frame with corrections for items concerning V1 only
T item V1
1 a 2
2 a 6
How do I get the final data frame? Should I use merge or rbind. Note: actual data frames are big.
An option would be a data.table join on 'T' and 'item', assigning to 'V1' the corresponding 'V1' column (i.V1) from the second dataset
library(data.table)
setDT(df1)[df2, V1 := i.V1, on = .(T, item)]
df1
# T item V1 V2
#1: 1 a 2 0.1
#2: 2 a 6 0.8
#3: 1 b 1 0.7
#4: 2 b 2 0.2
data
df1 <- structure(list(T = c(1L, 2L, 1L, 2L), item = c("a", "a", "b",
"b"), V1 = c(2L, 5L, 1L, 2L), V2 = c(0.1, 0.8, 0.7, 0.2)),
class = "data.frame", row.names = c(NA, -4L))
df2 <- structure(list(T = 1:2, item = c("a", "a"), V1 = c(2L, 6L)),
class = "data.frame", row.names = c(NA,
-2L))
This should work -
library(dplyr)
df1 %>%
left_join(df2, by = c("T", "item")) %>%
mutate(
V1 = coalesce(as.numeric(V1.y), as.numeric(V1.x))
) %>%
select(-V1.x, -V1.y)

R - reduce with merge and more than 2 suffixes (or: how to merge multiple dataframes and keep track of columns)

I'm trying to merge 4 dataframes based on 2 columns, but keep track of which dataframe a column originated from. I'm running into an issue at tracking the columns.
(see end of post of dput(dfs))
#df example (df1)
Name Color Freq
banana yellow 3
apple red 1
apple green 4
plum purple 8
#create list of dataframes
list.df <- list(df1, df2, df3, df4)
#merge dfs on column "Name" and "Color"
combo.df <- Reduce(function(x,y) merge(x,y, by = c("Name", "Color"), all = TRUE, accumulate=FALSE, suffixes = c(".df1", ".df2", ".df3", ".df4")), list.df)
This gives the following warning:
Warning message:
In merge.data.frame(x, y, by = c("Name", "Color"), all = TRUE, :
column names ‘Freq.df1’, ‘Freq.df2’ are duplicated in the result
and outputs this dataframe:
#combo df example
Name Color Freq.df1 Freq.df2 Freq.df1 Freq.df2
banana yellow 3 3 7 NA
apple red 1 2 9 1
apple green 4 NA 8 2
plum purple 8 1 NA 6
df1 and df2 are only repeated in name. The values populating the third and fourth column of combo are actually from df3 and df4 respectively.
What I would really like is:
Name Color Freq.df1 Freq.df2 Freq.df3 Freq.df4
banana yellow 3 3 7 NA
apple red 1 2 9 1
apple green 4 NA 8 2
plum purple 8 1 NA 6
How can I achieve this? I know the merge(..., suffixes) function can only handle a character vector of 2, but I don't know what the work around should be. Thanks!
df1 <-
structure(list(Name = structure(c(2L, 1L, 1L, 3L), .Label = c("apple",
"banana", "plum"), class = "factor"), Color = structure(c(4L,
3L, 1L, 2L), .Label = c("green", "purple", "red", "yellow"), class = "factor"),
Freq = c(3, 1, 4, 8)), .Names = c("Name", "Color", "Freq"
), row.names = c(NA, -4L), class = "data.frame")
df2 <-
structure(list(Name = structure(c(2L, 1L, 3L), .Label = c("apple",
"banana", "plum"), class = "factor"), Color = structure(c(3L,
2L, 1L), .Label = c("purple", "red", "yellow"), class = "factor"),
Freq = c(3, 2, 1)), .Names = c("Name", "Color", "Freq"), row.names = c(NA,
-3L), class = "data.frame")
df3 <-
structure(list(Name = structure(c(2L, 1L, 1L), .Label = c("apple",
"banana"), class = "factor"), Color = structure(c(3L, 2L, 1L), .Label = c("green",
"red", "yellow"), class = "factor"), Freq = c(7, 9, 8)), .Names = c("Name",
"Color", "Freq"), row.names = c(NA, -3L), class = "data.frame")
df4 <-
structure(list(Name = structure(c(1L, 1L, 2L), .Label = c("apple",
"plum"), class = "factor"), Color = structure(c(3L, 1L, 2L), .Label = c("green",
"purple", "red"), class = "factor"), Freq = c(1, 2, 6)), .Names = c("Name",
"Color", "Freq"), row.names = c(NA, -3L), class = "data.frame")
This seems to be easier with a for loop, as Reduce or reduce (purrr) takes only two datasets at a time, so we can't have more than two suffixes in the merge.
Here, we created a vector of suffixes ('sfx'). Initialize an output dataset with the first list element. Then loop through the sequence of 'list.df' and do a sequential merge with the 'res' and the next element of list.df while updating the 'res' in each step
# One suffix per data frame in `list.df`, in the same order.
sfx <- c(".df1", ".df2", ".df3", ".df4")
keys <- c("Name", "Color")

# Append `s` to every non-key column name of `d`.
# merge() only applies `suffixes` when non-key names collide, so relying on
# it leaves an unsuffixed column whenever the number of data frames is odd.
# Renaming up front makes the result independent of the number of inputs.
add_suffix <- function(d, s) {
  is_value <- !names(d) %in% keys
  names(d)[is_value] <- paste0(names(d)[is_value], s)
  d
}

# Start from the first data frame, then fold in the remaining ones.
res <- add_suffix(list.df[[1]], sfx[1])
# seq_along(...)[-1] is empty for a single-element list, so the loop is
# skipped instead of indexing past the end.
for (i in seq_along(list.df)[-1]) {
  res <- merge(res, add_suffix(list.df[[i]], sfx[i]), all = TRUE, by = keys)
}
res
# Name Color Freq.df1 Freq.df2 Freq.df3 Freq.df4
#1 apple green 4 NA 8 2
#2 apple red 1 2 9 1
#3 banana yellow 3 3 7 NA
#4 plum purple 8 1 NA 6
I finally could make this one work using Reduce function itself. To do so I modified the input in a particular format.
As we could not pass the names of the data.frame as parameter inside the Reduce function, I created a list with an attribute n containing the name of the data.frame.
lst=list(list(n="df1",df=df1),list(n="df2",df=df2),list(n="df3",df=df3), list(n="df4",df=df4))
Around that I have built the logic to track the name of the data.frames being processed.
# Fold the list of (name, data frame) pairs into a single merged data frame,
# labelling each Freq column with the name of the data frame it came from.
Reduce(function(x, y) {
  # Label for the Freq column contributed by the right-hand data frame.
  incoming <- paste0("Freq.", y$n)
  merged <- merge.data.frame(
    x = x$df, y = y$df, by = c("Name", "Color"), all = TRUE
  )
  colnames(merged) <- if (ncol(x$df) == 3) {
    # Left side still has three columns, so its Freq column needs a label too.
    c("Name", "Color", paste0("Freq.", x$n), incoming)
  } else {
    # Left side is an earlier merge result: keep its names, append the new one.
    c(colnames(x$df), incoming)
  }
  list(n = "df", df = merged)
}, lst)
#$n
#[1] "df"
#$df
# Name Color Freq.df1 Freq.df2 Freq.df3 Freq.df4
#1 apple green 4 NA 8 2
#2 apple red 1 2 9 1
#3 banana yellow 3 3 7 NA
#4 plum purple 8 1 NA 6
The function eat of my package safejoin has such a feature: if you give
it a named list of data.frames as a second input, it will join them
recursively to the first input, prefixing the new columns with their names.
The column of the first input we'll have to rename separately.
# devtools::install_github("moodymudskipper/safejoin")
library(safejoin)
library(dplyr)
eat(rename(df1,df1_Freq = Freq), lst(df2,df3,df4),
.by = c("Name","Color"), .mode= "full",.check="")
# Name Color df1_Freq df2_Freq df3_Freq df4_Freq
# 1 banana yellow 3 3 7 NA
# 2 apple red 1 2 9 1
# 3 apple green 4 NA 8 2
# 4 plum purple 8 1 NA 6
.mode = "full" is to make a full outer join, though here the default (left join), gives the same result.
.check = "" is to remove checks, which would warn that the factors have different levels among join columns.

R: rename columns in list based on other row value

I have imported data from matlab and have a large list (over 1000 list elements) from which I created the following sample dataset data with only two list elements.
data <- structure(list(TEST.DATA.1.1 = structure(list(ID = c(2, 2, 2), YEAR = c(1990, 1991, 1992), DATA.1 = c(10, 20, 30), DATA.NAME = structure(c(1L, 1L, 1L), class = "factor", .Label = "Test"), Remarks = c(1990, 1991, 1992)), .Names = c("ID", "YEAR", "DATA.1", "DATA.NAME", "Remarks"), row.names = c(NA, -3L), class = "data.frame"), TEST.DATA.2.1 = structure(list(ID = c(4, 4), YEAR = c(2000, 2001), DATA.1 = c(55, 60), DATA.2 = c(0, 2), DATA.3 = c(4, 6), DATA.NAME.structure..n1....Dim...c.1L..1L.. = structure(c(1L,1L), class = "factor", .Label = "n1"), DATA.NAME.structure..n2....Dim...c.1L..1L.. = structure(c(1L, 1L), class = "factor", .Label = "n2"), DATA.NAME.structure..n3....Dim...c.1L..1L.. = structure(c(1L,1L), class = "factor", .Label = "n3"), Remarks = c(2000,2001)), .Names = c("ID", "YEAR", "DATA.1", "DATA.2", "DATA.3", "DATA.NAME.structure..n1....Dim...c.1L..1L..", "DATA.NAME.structure..n2....Dim...c.1L..1L..", "DATA.NAME.structure..n3....Dim...c.1L..1L..", "Remarks"), row.names = c(NA, -2L), class = "data.frame")), .Names = c("TEST.DATA.1.1", "TEST.DATA.2.1"))
data
$TEST.DATA.1.1
ID YEAR DATA.1 DATA.NAME Remarks
1 2 1990 10 Test 1990
2 2 1991 20 Test 1991
3 2 1992 30 Test 1992
$TEST.DATA.2.1
ID YEAR DATA.1 DATA.2 DATA.3 DATA.NAME.structure..n1....Dim...c.1L..1L.. DATA.NAME.structure..n2....Dim...c.1L..1L.. DATA.NAME.structure..n3....Dim...c.1L..1L.. Remarks
1 4 2000 55 0 4 n1 n2 n3 2000
2 4 2001 60 2 6 n1 n2 n3 2001
I am looking for a way to rename the data columns with the names from the DATA.NAME column(s). Sometimes there are multiple data columns and respective names, as in the second list element, and sometimes there is only one, as in the first element. I would like to do the renaming for a large list (> 1000 list elements) and then drop the DATA.NAME columns, as in data_new.
data_new
$TEST.DATA.1.1
ID YEAR Test Remarks
1 2 1990 10 1990
2 2 1991 20 1991
3 2 1992 30 1992
$TEST.DATA.2.1
ID YEAR n1 n2 n3 Remarks
1 4 2000 55 0 4 2000
2 4 2001 60 2 6 2001
Here's a base R approach:
# Rename each DATA.<n> column after the first-row value of the corresponding
# DATA.NAME* column, then drop the DATA.NAME* columns.
for (k in seq_along(data)) {
  d <- data[[k]]
  cols <- names(d)
  # Positions of the columns holding the names and of the columns to rename.
  name_idx <- grep('^DATA\\.NAME', cols)
  value_idx <- grep('^DATA\\.\\d+', cols)
  # The first row of the name columns supplies the replacement names.
  cols[value_idx] <- as.character(unlist(d[1, name_idx]))
  names(d) <- cols
  # Assigning list(NULL) removes the name columns.
  d[name_idx] <- list(NULL)
  data[[k]] <- d
}
data
## $TEST.DATA.1.1
## ID YEAR Test Remarks
## 1 2 1990 10 1990
## 2 2 1991 20 1991
## 3 2 1992 30 1992
##
## $TEST.DATA.2.1
## ID YEAR n1 n2 n3 Remarks
## 1 4 2000 55 0 4 2000
## 2 4 2001 60 2 6 2001
Solution using data.table package.
require(data.table)
data <- structure(list(TEST.DATA.1.1 = structure(list(ID = c(2, 2, 2), YEAR = c(1990, 1991, 1992), DATA.1 = c(10, 20, 30), DATA.NAME = structure(c(1L, 1L, 1L), class = "factor", .Label = "Test"), Remarks = c(1990, 1991, 1992)), .Names = c("ID", "YEAR", "DATA.1", "DATA.NAME", "Remarks"), row.names = c(NA, -3L), class = "data.frame"), TEST.DATA.2.1 = structure(list(ID = c(4, 4), YEAR = c(2000, 2001), DATA.1 = c(55, 60), DATA.2 = c(0, 2), DATA.3 = c(4, 6), DATA.NAME.structure..n1....Dim...c.1L..1L.. = structure(c(1L,1L), class = "factor", .Label = "n1"), DATA.NAME.structure..n2....Dim...c.1L..1L.. = structure(c(1L, 1L), class = "factor", .Label = "n2"), DATA.NAME.structure..n3....Dim...c.1L..1L.. = structure(c(1L,1L), class = "factor", .Label = "n3"), Remarks = c(2000,2001)), .Names = c("ID", "YEAR", "DATA.1", "DATA.2", "DATA.3", "DATA.NAME.structure..n1....Dim...c.1L..1L..", "DATA.NAME.structure..n2....Dim...c.1L..1L..", "DATA.NAME.structure..n3....Dim...c.1L..1L..", "Remarks"), row.names = c(NA, -2L), class = "data.frame")), .Names = c("TEST.DATA.1.1", "TEST.DATA.2.1"))
# Rename the DATA.<n> columns of one data frame after the first-row values of
# its DATA.NAME* columns, then drop the DATA.NAME* columns.
#
# x: a data.frame with DATA.<n> value columns and matching DATA.NAME* columns.
# Returns a data.table with the value columns renamed and name columns removed.
fun <- function(x) {
  x <- data.table(x)
  # Anchor the patterns and escape the dot so that only columns that start
  # with the literal prefix are picked up.
  value_cols <- grep("^DATA\\.[0-9]+", names(x), value = TRUE)
  name_cols <- grep("^DATA\\.NAME", names(x), value = TRUE)
  # The first row of the name columns supplies the replacement names.
  new_names <- as.character(unlist(x[1, name_cols, with = FALSE]))
  setnames(x, value_cols, new_names)
  # Wrap the LHS in parentheses instead of the deprecated `with = FALSE`
  # together with `:=`; skip the deletion when there is nothing to delete.
  if (length(name_cols) > 0) {
    x[, (name_cols) := NULL]
  }
  x
}
data_new <- lapply(data, fun)
This should work...
library(dplyr)
# Rename each DATA.<n> column after the first-row value of the corresponding
# DATA.NAME* column, then drop the DATA.NAME* columns.
# seq_along() keeps the loop from running on an empty list, where
# 1:length(data) would iterate over c(1, 0) and fail.
for (i in seq_along(data)) {
  d <- data[[i]]
  # Find the new names: first-row values of the DATA.NAME* columns
  new_names <- select(d, starts_with('DATA.NAME'))
  new_names <- unlist(new_names[1, ])
  names(new_names) <- NULL
  new_names <- as.character(new_names)
  # Remove the columns containing the names
  d <- select(d, -starts_with('DATA.NAME'))
  # Pick which columns we want to replace; the pattern is anchored and the
  # dot escaped so only columns starting with "DATA.<digits>" are matched
  to_replace <- grep('^DATA\\.[0-9]+', names(d))
  # Replace those names
  names(d)[to_replace] <- new_names
  # Replace the list element
  data[[i]] <- d
}

Extract each column of a data frame into an object

I have a data frame with many columns, named foo, bar, etc.
I would like to extract each column of the data frame to separate objects called foo, bar and so on. Is there an automated way to do this in R?
Working example:
mock <- structure(list(
x = structure(1:3, .Label = c("1", "2", "3"), class = "factor"),
y = structure(1:3, .Label = c("A", "B", "C"), class = "factor"),
z = structure(c(1L, 1L, 2L), .Label = c("0", "1"), class = "factor")),
.Names = c("x", "y", "z"), row.names = c(NA, -3L), class = "data.frame")
Output:
> mock
x y z
1 1 A 0
2 2 B 0
3 3 C 1
How can I write a loop that creates objects x, y and z from the three columns of this data frame?
> for (i in 1:ncol(mock)) {
+ assign(names(mock)[i],mock[,i])
+ }
> x
[1] 1 2 3
Levels: 1 2 3
> y
[1] A B C
Levels: A B C
> z
[1] 0 0 1
Levels: 0 1
You should be careful with the use of assign, though. You can achieve almost the same result using attach(mock), which is reversible (detach()) and won't unintentionally overwrite existing variables (it just masks them).

Resources