Combine two matrix and mark common - r

I have two matrix as like this
Vehicle1 Year type
Car1 20 A
Car2 21 A
Car8 20 A
Second one
Vehicle2 Year type
Car1 20 M
Car2 21 M
Car7 90 M
I just need to combine the matrix based on the first column(Vehicle) and need to mark common as A/M as like this
Vehicle Year type
Car1 20 A/M
Car2 21 A/M
Car7 90 M
Car8 20 A
I used merge function for this but it only printing the common one

You can join the two dataframe and combine the type columns :
dplyr::full_join(df1, df2, by = c('Vehicle1' = 'Vehicle2', 'Year')) %>%
tidyr::unite(type, type.x, type.y, sep = '/', na.rm = TRUE)
# Vehicle1 Year type
#1 Car1 20 A/M
#2 Car2 21 A/M
#3 Car8 20 A
#4 Car7 90 M
data
df1 <- structure(list(Vehicle1 = c("Car1", "Car2", "Car8"), Year = c(20L,
21L, 20L), type = c("A", "A", "A")), class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(Vehicle2 = c("Car1", "Car2", "Car7"), Year = c(20L,
21L, 90L), type = c("M", "M", "M")), class = "data.frame", row.names = c(NA, -3L))

Another dplyr solution.
library(dplyr)
df2 %>%
rename(Vehicle1 = Vehicle2) %>%
bind_rows(df1, .) %>%
group_by(Vehicle1, Year) %>%
summarise(type = paste(type, collapse = "/")) %>%
ungroup()
# # A tibble: 4 x 3
# Vehicle1 Year type
# <chr> <int> <chr>
# 1 Car1 20 A/M
# 2 Car2 21 A/M
# 3 Car7 90 M
# 4 Car8 20 A

You can also do this easily in base R.
rr <- merge(m1, m2, all=T, by.x="Vehicle1", by.y="Vehicle2")
rr <- setNames(na.omit(reshape(rr, idvar="Vehicle1", varying=list(c(2, 4), c(3, 5)),
direction="long")), c("Vehicle1", "t", names(m1)[-1]))
dupes <- which(duplicated(rr$Vehicle1))
rr[rr$Vehicle1 %in% rr$Vehicle1[dupes], 4] <- "A/M"
res <- rr[-dupes, -2]
res
# Vehicle1 Year type
# Car1.1 Car1 20 A/M
# Car2.1 Car2 21 A/M
# Car8.1 Car8 20 A
# Car7.2 Car7 90 M
Data:
m1 <- structure(list(Vehicle1 = c("Car1", "Car2", "Car8"), Year = c(20L,
21L, 20L), type = c("A", "A", "A")), class = "data.frame", row.names = c(NA,
-3L))
m2 <- structure(list(Vehicle2 = c("Car1", "Car2", "Car7"), Year = c(20L,
21L, 90L), type = c("M", "M", "M")), class = "data.frame", row.names = c(NA,
-3L))

Here is a base R option using merge
tmp <- merge(df1, df2, by.x = c("Vehicle1", "Year"), by.y = c("Vehicle2", "Year"), all = TRUE)
dfout <- cbind(tmp[c("Vehicle1", "Year")],
type = apply(
tmp[grep("type", names(tmp))],
1,
function(...) ifelse(any(is.na(...)), na.omit(...), paste0(..., collapse = "/"))
)
)
such that
> dfout
Vehicle1 Year type
1 Car1 20 A/M
2 Car2 21 A/M
3 Car7 90 M
4 Car8 20 A
Data
> dput(df1)
structure(list(Vehicle1 = c("Car1", "Car2", "Car8"), Year = c(20L,
21L, 20L), type = c("A", "A", "A")), class = "data.frame", row.names = c(NA,
-3L))
> dput(df2)
structure(list(Vehicle2 = c("Car1", "Car2", "Car7"), Year = c(20L,
21L, 90L), type = c("M", "M", "M")), class = "data.frame", row.names = c(NA,
-3L))

Related

How to add na based on condition for a whole dataframe

I just want to know how to find and replace empty columns into na for a whole data frame
sample data
structure(list(id = structure(8.44425875736171e-318, class = "integer64"),
project_id = 11L, experiment_id = 85L,
gene = "", si = -0.381, pi = ""
on1 = "CC",
on2 = "GG",
on3 = "aa",
created_at = structure(1618862091.85075, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x000001ba09da3590>)
i have a solution to check for a particular column but i dont how to apply this for whole dataframe
data$gene <- ifelse((is.na(data$gene) == TRUE),'NA',data$gene)
You could use lapply with gsub to replace each empty cell with NA like this:
df <- structure(list(id = structure(8.44425875736171e-318, class = "integer64"),
project_id = 11L, experiment_id = 85L,
gene = "", si = -0.381, pi = "",
on1 = "CC",
on2 = "GG",
on3 = "aa",
created_at = structure(1618862091.85075, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("data.table",
"data.frame"))
df
#> id project_id experiment_id gene si pi on1 on2 on3
#> 1 8.444259e-318 11 85 -0.381 CC GG aa
#> created_at
#> 1 2021-04-19 19:54:51
df[] <- lapply(df, function(x) gsub("^$", NA, x))
df
#> id project_id experiment_id gene si pi on1 on2 on3
#> 1 8.44425875736171e-318 11 85 <NA> -0.381 <NA> CC GG aa
#> created_at
#> 1 2021-04-19 19:54:51
Created on 2022-11-02 with reprex v2.0.2
You can also use dplyr with mutate and across
library(dplyr)
library(tidyr)
df <- structure(list(id = structure(8.44425875736171e-318, class = "integer64"),
project_id = 11L, experiment_id = 85L,
gene = "", si = -0.381, pi = "",
on1 = "CC",
on2 = "GG",
on3 = "aa",
created_at = structure(1618862091.85075, class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -1L), class = c("data.table",
"data.frame"))
df %>%
mutate(dplyr::across(where(is.character), ~ gsub("^$", NA, .x)))
Note that I also attempted to use replace_na, however this only works on values that are actually NA.
test %>%
mutate(dplyr::across(where(is.character), ~ replace_na(.x, "NA")))
"" is not considered
NA is considered NA
Keep that in mind while you are performing your analysis.
Using na_if
library(data.table)
library(dplyr)
df[, lapply(.SD, \(x) if(is.character(x)) na_if(x, "") else x)]
-output
id project_id experiment_id gene si pi on1 on2 on3 created_at
<i64> <int> <int> <char> <num> <char> <char> <char> <char> <POSc>
1: 1709137 11 85 <NA> -0.381 <NA> CC GG aa 2021-04-19 19:54:51

dplyr join with three data frame

I have 3 data frames as like this
df1 <- structure(list(Vehicle = c("Car1", "Car2", "Car8"), Year = c(20L,
21L, 20L), type = c("A", "A", "A")), class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(Vehicle = c("Car1", "Car2", "Car7"), Year = c(20L,
21L, 90L), type = c("M", "M", "M")), class = "data.frame", row.names = c(NA, -3L))
df3 <- structure(list(Vehicle = c("Car1", "Car2", "Car9"), Year = c(20L,
21L, 92L), type = c("I", "I", "I")), class = "data.frame", row.names = c(NA, -3L))
And I need to make a new table as follows
Vehicle Year type
Car1 20 A/M/I
Car2 21 A/M/I
Car7 90 M
Car8 20 A
Car9 92 I
for this purpose I used this code using dplyr as like this, but it is not working with 3 data frames:
dplyr::full_join(df1, df2, df3, by = c('Vehicle', 'Year')) %>%
tidyr::unite(type, type.x, type.y, sep = '/', na.rm = TRUE)
Try this approach. Instead of merging it looks like you want to combine all dataframes and then aggregate. Here the code using dplyr:
library(dplyr)
#Code
newdf <- bind_rows(df1,df2,df3) %>%
group_by(Vehicle,Year) %>%
summarise(type=paste0(type,collapse='|'))
Output:
# A tibble: 5 x 3
# Groups: Vehicle [5]
Vehicle Year type
<chr> <int> <chr>
1 Car1 20 A|M|I
2 Car2 21 A|M|I
3 Car7 90 M
4 Car8 20 A
5 Car9 92 I
Generally, to merge >2 data.frame's/tibble's you'd use either base R's Reduce or purrr::reduce; for example using the latter:
list(df1, df2, df3) %>%
purrr::reduce(dplyr::full_join, by = c("Vehicle", "Year")) %>%
tidyr::unite(type, dplyr::starts_with("type"), sep = "/", na.rm = TRUE)
# Vehicle Year type
#1 Car1 20 A/M/I
#2 Car2 21 A/M/I
#3 Car8 20 A
#4 Car7 90 M
#5 Car9 92 I
Using base R
aggregate(type ~ Vehicle + Year, rbind(df1, df2, df3) ,
FUN = paste, collapse="|")
-output
# Vehicle Year type
#1 Car1 20 A|M|I
#2 Car8 20 A
#3 Car2 21 A|M|I
#4 Car7 90 M
#5 Car9 92 I

Merging two df One to Many within List - R

To start I will ignore the use of lists and show what I want using two df's.
I have df1
ID v1 Join_ID
1 100 1
2 110 2
3 150 3
And df2
Join_ID Type v2
1 a 80
1 b 90
2 a 70
2 b 60
3 a 50
3 b 40
I want the df.join to be:
ID v1 Join_ID a_v2 b_v2
1 100 1 80 90
2 110 2 70 60
3 150 3 50 40
I have tried:
df.merged <- merge(df1, df2, by="Join_ID")
df.wide <- dcast(melt(df.merged, id.vars=c("ID", "type")), ID~variable+type)
But this repeats all the variables in df1 for each type: v1_a v1_b
On top of this I have two lists
list.1
df1_a
df1_b
df1_c
list.2
df2_a
df2_b
df2_c
And I want the df1_a in list 1 to join with the df2_a in list 2
We can do this with maping through the list elements and then do the join
library(tidyverse)
map2(list.1, list.2, ~
.y %>%
mutate(Type = paste0(Type, "_v2")) %>%
spread(Type, v2) %>%
inner_join(.x, by = 'Join_ID'))
data
df1 <- structure(list(ID = 1:3, v1 = c(100L, 110L, 150L), Join_ID = 1:3),
.Names = c("ID",
"v1", "Join_ID"), class = "data.frame", row.names = c(NA, -3L
))
df2 <- structure(list(Join_ID = c(1L, 1L, 2L, 2L, 3L, 3L), Type = c("a",
"b", "a", "b", "a", "b"), v2 = c(80L, 90L, 70L, 60L, 50L, 40L
)), .Names = c("Join_ID", "Type", "v2"), class = "data.frame", row.names = c(NA,
-6L))
list.1 <- list(df1_a = df1, df1_b = df1, df1_c = df1)
list.2 <- list(df2_a = df2, df2_b = df2, df2_c = df2)
Some replies to your request :
1. the reshaping of df2
2. the join with different column names
library(reshape2)
df1=data.frame(id=c(1,2,3), v1=c(100,110,150))
df2=data.frame(Join_ID=c(1,1,2,2,3,3),Type=c("a","b","a","b","a","b"),v2=c(80,90,70,60,50,40))
cast_df2=dcast(df2, Join_ID ~ Type)
mergedData <- full_join(df1,cast_df2, by=c("id"="Join_ID"),suffixes=c("_df1","_df2") )

R: rename columns in list based on other row value

I have imported data from matlab and have a large list (over 1000 list elements) from which I created the following sample dataset data with only two list elements.
data <- structure(list(TEST.DATA.1.1 = structure(list(ID = c(2, 2, 2), YEAR = c(1990, 1991, 1992), DATA.1 = c(10, 20, 30), DATA.NAME = structure(c(1L, 1L, 1L), class = "factor", .Label = "Test"), Remarks = c(1990, 1991, 1992)), .Names = c("ID", "YEAR", "DATA.1", "DATA.NAME", "Remarks"), row.names = c(NA, -3L), class = "data.frame"), TEST.DATA.2.1 = structure(list(ID = c(4, 4), YEAR = c(2000, 2001), DATA.1 = c(55, 60), DATA.2 = c(0, 2), DATA.3 = c(4, 6), DATA.NAME.structure..n1....Dim...c.1L..1L.. = structure(c(1L,1L), class = "factor", .Label = "n1"), DATA.NAME.structure..n2....Dim...c.1L..1L.. = structure(c(1L, 1L), class = "factor", .Label = "n2"), DATA.NAME.structure..n3....Dim...c.1L..1L.. = structure(c(1L,1L), class = "factor", .Label = "n3"), Remarks = c(2000,2001)), .Names = c("ID", "YEAR", "DATA.1", "DATA.2", "DATA.3", "DATA.NAME.structure..n1....Dim...c.1L..1L..", "DATA.NAME.structure..n2....Dim...c.1L..1L..", "DATA.NAME.structure..n3....Dim...c.1L..1L..", "Remarks"), row.names = c(NA, -2L), class = "data.frame")), .Names = c("TEST.DATA.1.1", "TEST.DATA.2.1"))
data
$TEST.DATA.1.1
ID YEAR DATA.1 DATA.NAME Remarks
1 2 1990 10 Test 1990
2 2 1991 20 Test 1991
3 2 1992 30 Test 1992
$TEST.DATA.2.1
ID YEAR DATA.1 DATA.2 DATA.3 DATA.NAME.structure..n1....Dim...c.1L..1L.. DATA.NAME.structure..n2....Dim...c.1L..1L.. DATA.NAME.structure..n3....Dim...c.1L..1L.. Remarks
1 4 2000 55 0 4 n1 n2 n3 2000
2 4 2001 60 2 6 n1 n2 n3 2001
I am looking for a way how I could rename the data columns with the name from the column(s) DATA.NAME. Sometimes there are multiple data columns and respective names such as in the second list element and sometimes there is only one such as in the first element. I am looking for a way to do the renaming for a large list (> 1000 list elements) and then drop the DATA.NAME columns such as in data_new.
data_new
$TEST.DATA.1.1
ID YEAR Test Remarks
1 2 1990 10 1990
2 2 1991 20 1991
3 2 1992 30 1992
$TEST.DATA.2.1
ID YEAR n1 n2 n3 Remarks
1 4 2000 55 0 4 2000
2 4 2001 60 2 6 2001
Here's a base R approach:
for (i in seq_along(data)) {
namecis <- grep('^DATA\\.NAME',names(data[[i]]));
datacis <- grep('^DATA\\.\\d+',names(data[[i]]));
names(data[[i]])[datacis] <- as.character(unlist(data[[i]][1,namecis]));
data[[i]][namecis] <- list(NULL);
};
data;
## $TEST.DATA.1.1
## ID YEAR Test Remarks
## 1 2 1990 10 1990
## 2 2 1991 20 1991
## 3 2 1992 30 1992
##
## $TEST.DATA.2.1
## ID YEAR n1 n2 n3 Remarks
## 1 4 2000 55 0 4 2000
## 2 4 2001 60 2 6 2001
Solution using data.table package.
require(data.table)
data <- structure(list(TEST.DATA.1.1 = structure(list(ID = c(2, 2, 2), YEAR = c(1990, 1991, 1992), DATA.1 = c(10, 20, 30), DATA.NAME = structure(c(1L, 1L, 1L), class = "factor", .Label = "Test"), Remarks = c(1990, 1991, 1992)), .Names = c("ID", "YEAR", "DATA.1", "DATA.NAME", "Remarks"), row.names = c(NA, -3L), class = "data.frame"), TEST.DATA.2.1 = structure(list(ID = c(4, 4), YEAR = c(2000, 2001), DATA.1 = c(55, 60), DATA.2 = c(0, 2), DATA.3 = c(4, 6), DATA.NAME.structure..n1....Dim...c.1L..1L.. = structure(c(1L,1L), class = "factor", .Label = "n1"), DATA.NAME.structure..n2....Dim...c.1L..1L.. = structure(c(1L, 1L), class = "factor", .Label = "n2"), DATA.NAME.structure..n3....Dim...c.1L..1L.. = structure(c(1L,1L), class = "factor", .Label = "n3"), Remarks = c(2000,2001)), .Names = c("ID", "YEAR", "DATA.1", "DATA.2", "DATA.3", "DATA.NAME.structure..n1....Dim...c.1L..1L..", "DATA.NAME.structure..n2....Dim...c.1L..1L..", "DATA.NAME.structure..n3....Dim...c.1L..1L..", "Remarks"), row.names = c(NA, -2L), class = "data.frame")), .Names = c("TEST.DATA.1.1", "TEST.DATA.2.1"))
fun <- function(x) {
x <- data.table(x)
var1 <- grep("DATA.[0-9]", names(x), value = T)
var2 <- as.character(unlist(x[1, grep("DATA.NAME", names(x)), with = F]))
setnames(x, var1, var2)
x[, grep("DATA.NAME", names(x)) := NULL, with = F]
return(x)
}
data_new <- lapply(data, fun)
This should work...
library(dplyr)
for (i in 1:length(data))
{
d <- data[[i]]
# Find the new names
new_names <- select(d, starts_with('DATA.NAME'))
new_names <- unlist(new_names[1,])
names(new_names) <- NULL
new_names <- as.character(new_names)
# Remove the columns containing the names
d <- select(d, -starts_with('DATA.NAME'))
# Pick which columns we want to replace
old_names <- names(d)
to_replace <- grep('DATA.[0-9]+', old_names)
# Replace those names
names(d)[to_replace] <- new_names
#Replace the list element
data[[i]] <- d
}

Reduce() in R over similar variable names causing error

I have 19 nested lists generated from a lapply and split operation.
These lists are in the form:
#list1
Var col1 col2 col3
A 2 3 4
B 3 4 5
#list2
Var col1 col2 col3
A 5 6 7
B 5 4 4
......
#list19
Var col1 col2 col3
A 3 6 7
B 7 4 4
I have been able to merge the lists with
merge.all <- function(x, y) merge(x, y, all=TRUE, by="Var")
out <- Reduce(merge.all, DataList)
I am however getting an error due to the similarity in the names of the other columns.
How can I concatenate the name of the list to the variable names so that I get something like this:
Var list1.col1 list1.col2 list1.col3 .......... list19.col3
A 2 3 4 7
B 3 4 5 .......... 4
I'm really sure somebody will come up with a much, much better solution. However, if you're after a quick and dirty solution, this seems to work.
My plan was to simply change the column names prior to merging.
#Sample Data
df1 <- data.frame(Var = c("A","B"), col1 = c(2,3), col2 = c(3,4), col3 = c(4,5))
df2 <- data.frame(Var = c("A","B"), col1 = c(5,5), col2 = c(6,4), col3 = c(7,5))
df19 <- data.frame(Var = c("A","B"), col1 = c(3,7), col2 = c(6,4), col3 = c(7,4))
mylist <- list(df1, df2, df19)
names(mylist) <- c("df1", "df2", "df19") #just manually naming, presumably your list has names
## Change column names by pasting name of dataframe in list with standard column names. - using ugly mix of `lapply` and a `for` loop:
mycolnames <- colnames(df1)
mycolnames1 <- lapply(names(mylist), function(x) paste0(x, mycolnames))
for(i in 1:length(mylist)){
colnames(mylist[[i]]) <- mycolnames1[[i]]
colnames(mylist[[i]])[1] <- "Var" #put Var back in so you can merge
}
## Merge
merge.all <- function(x, y)
merge(x, y, all=TRUE, by="Var")
out <- Reduce(merge.all, mylist)
out
# Var df1col1 df1col2 df1col3 df2col1 df2col2 df2col3 df19col1 df19col2 df19col3
#1 A 2 3 4 5 6 7 3 6 7
#2 B 3 4 5 5 4 5 7 4 4
There you go - it works but is very ugly.
To set the data frame names unique, you could use a function to set all list names that are not the merging variable to unique names.
resetNames <- function(x, byvar = "Var") {
asrl <- as.relistable(lapply(x, names))
allnm <- names(unlist(x, recursive = FALSE))
rpl <- replace(allnm, unlist(asrl) %in% byvar, byvar)
Map(setNames, x, relist(rpl, asrl))
}
Reduce(merge.all, resetNames(dlist))
# Var list1.col1 list1.col2 list1.col3 list2.col1 list2.col2 list2.col4 list3.col1
#1 A 2 3 4 5 6 7 3
#2 B 3 4 5 5 4 4 7
# list3.col2 list3.col3 list4.col1 list4.col2 list4.col3
#1 6 7 3 6 7
#2 4 4 4 5 6
when run your list with an added data frame there are no warnings. And there's always data table. Its merge method does not return a warning for duplicated column names.
library(data.table)
Reduce(merge.all, lapply(dlist, as.data.table))
Another option is to check the names as the data enters the function, change them there, and then you can avoid the warning. This isn't perfect but it works ok here.
merge.all <- function(x, y) {
m <- match(names(y)[-1], gsub("[.](x|y)$", "", names(x)[-1]), 0L)
names(y)[-1][m] <- paste0(names(y)[-1][m], "DUPE")
merge(x, y, all=TRUE, by="Var")
}
rm <- Reduce(merge.all, dlist)
names(rm)
# [1] "Var" "col1" "col2" "col3" "col1DUPE.x"
# [6] "col2DUPE.x" "col4" "col1DUPE.y" "col2DUPE.y" "col3DUPE.x"
# [11] "col1DUPE" "col2DUPE" "col3DUPE.y"
where dlist is
structure(list(list1 = structure(list(Var = structure(1:2, .Label = c("A",
"B"), class = "factor"), col1 = 2:3, col2 = 3:4, col3 = 4:5), .Names = c("Var",
"col1", "col2", "col3"), class = "data.frame", row.names = c(NA,
-2L)), list2 = structure(list(Var = structure(1:2, .Label = c("A",
"B"), class = "factor"), col1 = c(5L, 5L), col2 = c(6L, 4L),
col4 = c(7L, 4L)), .Names = c("Var", "col1", "col2", "col4"
), class = "data.frame", row.names = c(NA, -2L)), list3 = structure(list(
Var = structure(1:2, .Label = c("A", "B"), class = "factor"),
col1 = c(3L, 7L), col2 = c(6L, 4L), col3 = c(7L, 4L)), .Names = c("Var",
"col1", "col2", "col3"), class = "data.frame", row.names = c(NA,
-2L)), list4 = structure(list(Var = structure(1:2, .Label = c("A",
"B"), class = "factor"), col1 = 3:4, col2 = c(6L, 5L), col3 = c(7L,
6L)), .Names = c("Var", "col1", "col2", "col3"), row.names = c(NA,
-2L), class = "data.frame")), .Names = c("list1", "list2", "list3",
"list4"))

Resources