Reduce() in R over similar variable names causing error - r

I have 19 nested lists generated from a lapply and split operation.
These lists are in the form:
#list1
Var col1 col2 col3
A 2 3 4
B 3 4 5
#list2
Var col1 col2 col3
A 5 6 7
B 5 4 4
......
#list19
Var col1 col2 col3
A 3 6 7
B 7 4 4
I have been able to merge the lists with
merge.all <- function(x, y) merge(x, y, all=TRUE, by="Var")
out <- Reduce(merge.all, DataList)
I am however getting an error due to the similarity in the names of the other columns.
How can I concatenate the name of the list to the variable names so that I get something like this:
Var list1.col1 list1.col2 list1.col3 .......... list19.col3
A 2 3 4 7
B 3 4 5 .......... 4

I'm really sure somebody will come up with a much, much better solution. However, if you're after a quick and dirty solution, this seems to work.
My plan was to simply change the column names prior to merging.
# Sample data
df1 <- data.frame(
  Var = c("A", "B"), col1 = c(2, 3), col2 = c(3, 4), col3 = c(4, 5)
)
df2 <- data.frame(
  Var = c("A", "B"), col1 = c(5, 5), col2 = c(6, 4), col3 = c(7, 5)
)
df19 <- data.frame(
  Var = c("A", "B"), col1 = c(3, 7), col2 = c(6, 4), col3 = c(7, 4)
)
# Name the list elements up front; presumably your own list already has names.
mylist <- list(df1 = df1, df2 = df2, df19 = df19)

## Prefix every column except the merge key "Var" with the name of the data
## frame it comes from, separated by a dot (e.g. "df1.col1").
## Each frame's own column names are used, so frames with different columns
## work too, and Map() copes with an empty list where 1:length() would not.
mylist <- Map(function(df, nm) {
  is_value <- names(df) != "Var"
  names(df)[is_value] <- paste(nm, names(df)[is_value], sep = ".")
  df
}, mylist, names(mylist))

## Merge
# Full outer merge of two data frames on the shared "Var" column.
merge.all <- function(x, y) {
  merge(x, y, all = TRUE, by = "Var")
}
out <- Reduce(merge.all, mylist)
out
# Var df1.col1 df1.col2 df1.col3 df2.col1 df2.col2 df2.col3 df19.col1 df19.col2 df19.col3
#1 A 2 3 4 5 6 7 3 6 7
#2 B 3 4 5 5 4 5 7 4 4
There you go - it works but is very ugly.

To make the data frame column names unique, you could use a function that prefixes every column name other than the merging variable with the name of its list element.
# Give every column of every data frame in a named list a unique name.
#
# x:     list of data frames.
# byvar: column name(s) to leave untouched (the merge key).
#
# Returns the same list with each remaining column renamed to the name that
# unlist() builds from the list element name and the column name
# (e.g. "list1.col1").
resetNames <- function(x, byvar = "Var") {
  # Per-frame column names, kept as a skeleton so the flat vector of new
  # names can be folded back into the same per-frame shape.
  skeleton <- as.relistable(lapply(x, names))
  old_flat <- unlist(skeleton)
  # Flattening one level yields the combined "<element>.<column>" names.
  new_flat <- names(unlist(x, recursive = FALSE))
  # Restore the key column(s) to their plain name.
  is_key <- old_flat %in% byvar
  new_flat[is_key] <- byvar
  Map(setNames, x, relist(new_flat, skeleton))
}
Reduce(merge.all, resetNames(dlist))
# Var list1.col1 list1.col2 list1.col3 list2.col1 list2.col2 list2.col4 list3.col1
#1 A 2 3 4 5 6 7 3
#2 B 3 4 5 5 4 4 7
# list3.col2 list3.col3 list4.col1 list4.col2 list4.col3
#1 6 7 3 6 7
#2 4 4 4 5 6
When this is run on your list with an added data frame, there are no warnings. And there's always data.table: its merge method does not return a warning for duplicated column names.
library(data.table)
Reduce(merge.all, lapply(dlist, as.data.table))
Another option is to check the names as the data enters the function, change them there, and then you can avoid the warning. This isn't perfect but it works ok here.
# Merge two data frames on "Var", renaming clashing columns of `y` first.
#
# x, y: data frames whose first column is the merge key "Var".
#
# Any non-key column of `y` whose name already occurs among the non-key
# columns of `x` (ignoring a trailing ".x"/".y" left by an earlier merge)
# gets "DUPE" appended before merging.
#
# Returns the full outer merge of `x` and the renamed `y`.
merge.all <- function(x, y) {
  seen <- gsub("[.](x|y)$", "", names(x)[-1])
  y_cols <- names(y)[-1]
  # Use a logical mask over y's own columns; match() would give positions
  # in `x`, which are the wrong index when the column orders differ.
  clash <- y_cols %in% seen
  y_cols[clash] <- paste0(y_cols[clash], "DUPE")
  names(y)[-1] <- y_cols
  merge(x, y, all = TRUE, by = "Var")
}
rm <- Reduce(merge.all, dlist)
names(rm)
# [1] "Var" "col1" "col2" "col3" "col1DUPE.x"
# [6] "col2DUPE.x" "col4" "col1DUPE.y" "col2DUPE.y" "col3DUPE.x"
# [11] "col1DUPE" "col2DUPE" "col3DUPE.y"
where dlist is
structure(list(list1 = structure(list(Var = structure(1:2, .Label = c("A",
"B"), class = "factor"), col1 = 2:3, col2 = 3:4, col3 = 4:5), .Names = c("Var",
"col1", "col2", "col3"), class = "data.frame", row.names = c(NA,
-2L)), list2 = structure(list(Var = structure(1:2, .Label = c("A",
"B"), class = "factor"), col1 = c(5L, 5L), col2 = c(6L, 4L),
col4 = c(7L, 4L)), .Names = c("Var", "col1", "col2", "col4"
), class = "data.frame", row.names = c(NA, -2L)), list3 = structure(list(
Var = structure(1:2, .Label = c("A", "B"), class = "factor"),
col1 = c(3L, 7L), col2 = c(6L, 4L), col3 = c(7L, 4L)), .Names = c("Var",
"col1", "col2", "col3"), class = "data.frame", row.names = c(NA,
-2L)), list4 = structure(list(Var = structure(1:2, .Label = c("A",
"B"), class = "factor"), col1 = 3:4, col2 = c(6L, 5L), col3 = c(7L,
6L)), .Names = c("Var", "col1", "col2", "col3"), row.names = c(NA,
-2L), class = "data.frame")), .Names = c("list1", "list2", "list3",
"list4"))

Related

Verifying whether at least two columns have the same value in a specific row

I have a data frame and I want to see whether my variables all have unique values in a specific row.
let's say i want to analyze row D
my data
Name F S T
A 1 2 3
B 2 3 4
C 3 4 5
D 4 5 6
> TRUE (because all the three variables have unique value)
Second example
Name F S T
A 1 2 3
B 2 3 4
C 3 4 5
D 4 5 4
>False (because F and T have the same value in row D )
In base R do
# Check whether the values in the selected row are all different.
#
# dat: data frame; its first column is dropped before comparing.
# ind: row selector passed to `[`.
#
# Returns TRUE when no value is repeated, FALSE otherwise.
f1 <- function(dat, ind) {
  row_values <- unlist(dat[ind, -1])
  anyDuplicated(row_values) == 0L
}
-testing
> f1(df, 4)
[1] TRUE
> f1(df1, 4)
[1] FALSE
data
df <- structure(list(Name = c("A", "B", "C", "D"), F = 1:4, S = 2:5,
T = 3:6), class = "data.frame", row.names = c(NA, -4L))
df1 <- structure(list(Name = c("A", "B", "C", "D"), F = 1:4, S = 2:5,
T = c(3L, 4L, 5L, 4L)), class = "data.frame", row.names = c(NA,
-4L))
You can use dplyr for this:
# Count the distinct values in every column from the second onward, then
# test whether each of those counts equals nrow(df).
# NOTE(review): this looks at whole columns rather than at one chosen row --
# confirm it answers the question being asked.
df %>%
summarize_at(c(2:ncol(.)), n_distinct) %>%
summarize(if_all(.fns = ~ .x == nrow(df)))

Add two R data frames of different sizes [duplicate]

This question already has answers here:
How to join (merge) data frames (inner, outer, left, right)
(13 answers)
Closed 2 years ago.
If two data frames are
symbol wgt
1 A 2
2 C 4
3 D 6
symbol wgt
1 A 20
2 D 10
how can I add them so that missing observations for a "symbol" in either data frame are treated as zero, giving
symbol wgt
1 A 22
2 C 4
3 D 16
You can join the two dataframes by symbol , replace NA with 0 and add the two weights.
library(dplyr)
# Full join, so a symbol present in only one of the two data frames is kept;
# the weight missing on either side is then counted as zero before adding.
df1 %>%
  full_join(df2, by = "symbol") %>%
  mutate(
    wgt.x = replace(wgt.x, is.na(wgt.x), 0),
    wgt.y = replace(wgt.y, is.na(wgt.y), 0),
    wgt = wgt.x + wgt.y
  ) %>%
  select(-wgt.x, -wgt.y)
# symbol wgt
#1 A 22
#2 C 4
#3 D 16
data
df1 <- structure(list(symbol = c("A", "C", "D"), wgt = c(2L, 4L, 6L)),
class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(symbol = c("A", "D"), wgt = c(20L, 10L)),
class = "data.frame", row.names = c(NA, -2L))
Try this one line solution by pipes:
#Data
library(dplyr)
df1 <- structure(list(symbol = c("A", "C", "D"), wgt = c(2L, 4L, 6L)), class = "data.frame", row.names = c("1",
"2", "3"))
df2 <- structure(list(symbol = c("A", "D"), wgt = c(20L, 10L)), class = "data.frame", row.names = c("1",
"2"))
#Code
# Full join keeps symbols found in either data frame; rowSums() with
# na.rm = TRUE treats the missing weight as zero.
df1 %>%
  full_join(df2, by = "symbol") %>%
  mutate(wgt = rowSums(.[-1], na.rm = TRUE)) %>%
  select(c(1, 4))
symbol wgt
1 A 22
2 C 4
3 D 16
With data.table and the data provided in the answers of @RonakShah and @Duck, the solution could be a simple aggregation:
# Convert data.frame to data.table (very fast since inplace)
setDT(df1)
setDT(df2)
# combine both data.frames into one data.frame, group by symbol, apply the sum (NAs are ignored = counted as zero)
rbind(df1,df2)[, sum(wgt, na.rm = TRUE), by = symbol]
# Output
symbol V1
1: A 22
2: C 4
3: D 16
Note: If you want to use base R only (without data.table) you could use aggregate instead:
aggregate(wgt ~ symbol, rbind(df1,df2), sum)

conditionally add rows to data frames in a list of data frames

I have a list of data frames, where each data frame has either 1 or 2 rows named "mis" or "syn" (from a column named cat) and a second column with a numeric frequency. I want to fill in each data frame such that if the "mis" row is missing, it adds a mis row with frequency = 0
and if the "syn" row is missing, it adds a syn row with frequency 1:
###exmaple:
#example list of dataframes:
df1<- as.data.frame(cbind(cat = c("mis", "syn"), freq= c(4, 2)))
df2<- as.data.frame(cbind(cat = "mis", freq= 1))
df3<- as.data.frame(cbind(cat = "syn", freq= 2))
df_list<- list(df1 = df1, df2 = df2, df3= df3)
looks like:
> df_list
$df1
cat freq
1 mis 4
2 syn 2
$df2
cat freq
1 mis 1
$df3
cat freq
1 syn 2
Expected output:
> df_list
$df1
cat freq
mis 4
syn 2
$df2
cat freq
mis 1
syn 1
$df3
cat freq
syn 2
mis 0
what I've tried:
first I change the row names so that I can search by them
df_list_named<- lapply(df_list, function(x){ row.names(x)<-as.character(x$cat); x})
df_list_named
$df1
cat freq
mis mis 4
syn syn 2
$df2
cat freq
mis mis 1
$df3
cat freq
syn syn 2
Then I've been trying to use nested ifelse calls to append the rows to the data frames that need them, but I can't get it to work:
test<- lapply(df_list_named, function (x) ifelse(!row.names(df_list_named[[x]]) %in% "mis", rbind(df_list_named[[x]], c(cat = "mis", freq= 0)),
ifelse(!row.names(df_list_named[[x]]) %in% "syn", rbind(df_list_named[[x]], c(cat = "syn", freq= 1))))
Here is one way to do it with lapply
# Complete each data frame in df_list with the row it lacks.
# Frames that already hold both "mis" and "syn" are returned unchanged.
# If "mis" is present (and "syn" is not), a "syn" row with freq = 1 is
# appended; otherwise a "mis" row with freq = 0 is appended.
lapply(df_list, function(x) {
  has_mis <- "mis" %in% x$cat
  has_syn <- "syn" %in% x$cat
  if (has_mis && has_syn) {
    return(x)
  }
  filler <- if (has_mis) {
    data.frame(cat = "syn", freq = 1)
  } else {
    data.frame(cat = "mis", freq = 0)
  }
  rbind(x, filler)
})
#$df1
# cat freq
#1 mis 4
#2 syn 2
#$df2
# cat freq
#1 mis 1
#2 syn 1
#$df3
# cat freq
#1 syn 2
#2 mis 0
data
df1<- data.frame(cat = c("mis", "syn"), freq= c(4, 2), stringsAsFactors = FALSE)
df2<- data.frame(cat = "mis", freq= 1,stringsAsFactors = FALSE)
df3<- data.frame(cat = "syn", freq= 2, stringsAsFactors = FALSE)
df_list<- list(df1 = df1, df2 = df2, df3= df3)
You could use a "base" data frame and merge it with every data frame in the list using Map. The duplicated rows created in the already complete data frames can be safely excluded with !duplicated(), as they are always placed at the end.
(base <- data.frame(cat=factor(c("syn", "mis")), freq=factor(1:0)))
# cat freq
# 1 syn 1
# 2 mis 0
Map(function(x) {y <- (merge(x, base, all=TRUE));y[!duplicated(y$cat), ]}, df_list)
# $df1
# cat freq
# 1 mis 4
# 3 syn 2
#
# $df2
# cat freq
# 1 mis 1
# 3 syn 1
#
# $df3
# cat freq
# 1 syn 2
# 3 mis 0
Data
df_list <- list(df1 = structure(list(cat = structure(1:2, .Label = c("mis",
"syn"), class = "factor"), freq = structure(2:1, .Label = c("2",
"4"), class = "factor")), class = "data.frame", row.names = c(NA,
-2L)), df2 = structure(list(cat = structure(c(cat = 1L), .Label = "mis", class = "factor"),
freq = structure(c(freq = 1L), .Label = "1", class = "factor")), class = "data.frame", row.names = c(NA,
-1L)), df3 = structure(list(cat = structure(c(cat = 1L), .Label = "syn", class = "factor"),
freq = structure(c(freq = 1L), .Label = "2", class = "factor")), class = "data.frame", row.names = c(NA,
-1L)))

Replace a subset of data frame

I have a data frame with some error
T item V1 V2
1 a 2 .1
2 a 5 .8
1 b 1 .7
2 b 2 .2
I have another data frame with corrections for items concerning V1 only
T item V1
1 a 2
2 a 6
How do I get the final data frame? Should I use merge or rbind. Note: actual data frames are big.
An option would be a data.table join on 'T' and 'item', assigning 'V1' the corresponding 'V1' column (i.V1) from the second dataset
library(data.table)
setDT(df1)[df2, V1 := i.V1, on = .(T, item)]
df1
# T item V1 V2
#1: 1 a 2 0.1
#2: 2 a 6 0.8
#3: 1 b 1 0.7
#4: 2 b 2 0.2
data
df1 <- structure(list(T = c(1L, 2L, 1L, 2L), item = c("a", "a", "b",
"b"), V1 = c(2L, 5L, 1L, 2L), V2 = c(0.1, 0.8, 0.7, 0.2)),
class = "data.frame", row.names = c(NA, -4L))
df2 <- structure(list(T = 1:2, item = c("a", "a"), V1 = c(2L, 6L)),
class = "data.frame", row.names = c(NA,
-2L))
This should work -
library(dplyr)
df1 %>%
left_join(df2, by = c("T", "item")) %>%
mutate(
V1 = coalesce(as.numeric(V1.y), as.numeric(V1.x))
) %>%
select(-V1.x, -V1.y)

R paste0 2 columns if not NA

I would like to paste0 two columns if the element in one column is not NA. If an element of one column is NA, then keep only the element of the other column.
structure(list(col1 = structure(1:3, .Label = c("A", "B", "C"),
class = "factor"), col2 = c(1, NA, 3)), .Names = c("col1", "col2"),
class = "data.frame",row.names = c(NA, -3L))
# col1 col2
# 1 A 1
# 2 B NA
# 3 C 3
structure(list(col1 = structure(1:3, .Label = c("A", "B", "C"),
class = "factor"),col2 = c(1, NA, 3), col3 = c("A|1", "B", "C|3")),
.Names = c("col1", "col2", "col3"), row.names = c(NA,-3L),
class = "data.frame")
# col1 col2 col3
#1 A 1 A|1
#2 B NA B
#3 C 3 C|3
you can also do it with regular expressions:
# Paste the two columns with "|" between them, then delete the first "NA|" or
# "|NA" found in each result.
# NOTE(review): the pattern is not anchored, so a value that merely ends in
# "NA" before the separator (e.g. "DNA|3") would be altered as well.
df$col3 <- sub("NA\\||\\|NA", "", with(df, paste0(col1, "|", col2)))
That is, paste them in regular way and then replace any "NA|" or "|NA" with "". Note that | needs to be "double escaped" because it means "OR" in regexps, that's why the strange pattern NA\\||\\|NA means actually "NA|" OR "|NA".
As @Roland says, this is easy using ifelse (just translate the mental logic into a series of nested ifelse statements):
x <- transform(x,col3=ifelse(is.na(col1),as.character(col2),
ifelse(is.na(col2),as.character(col1),
paste0(col1,"|",col2))))
update: need as.character in some cases.
Try:
> df$col1 = as.character(df$col1)
> df$col3 = with(df, ifelse(is.na(col1),col2, ifelse(is.na(col2), col1, paste0(col1,'|',col2))))
> df
col1 col2 col3
1 A 1 A|1
2 B NA B
3 C 3 C|3
You could also do:
library(stringr)
df$col3 <- apply(df, 1, function(x)
paste(str_trim(x[!is.na(x)]), collapse="|"))
df
# col1 col2 col3
#1 A 1 A|1
#2 B NA B
#3 C 3 C|3

Resources