Merge 2 data frames by row and column overlap - r

I would like to merge 2 data frames additively such that
taxonomy A B C
1 rat 0 1 2
2 dog 1 2 3
3 cat 2 3 0
and
taxonomy A D C
1 rat 0 1 9
2 Horse 0 2 6
3 cat 2 0 2
produce
taxonomy A B C D
1 rat 0 1 11 1
2 Horse 0 0 6 2
3 cat 4 3 2 0
4 dog 1 2 3 0
I've tried aggregate, merge, apply, ddply... with no success. This will be done on 2 data frames with a couple hundred rows and columns.

With bind_rows from dplyr:
library(dplyr)
# Stack both frames (columns missing from one frame are filled with NA), then
# add up every count column per taxonomy, ignoring those NAs.
bind_rows(df1, df2) %>%
  group_by(taxonomy) %>%
  summarise(across(everything(), ~ sum(.x, na.rm = TRUE)))
Output:
# A tibble: 4 x 5
taxonomy A B C D
<chr> <int> <int> <int> <int>
1 cat 4 3 2 0
2 dog 1 2 3 0
3 Horse 0 0 6 2
4 rat 0 1 11 1
Data:
df1 <- structure(list(taxonomy = c("rat", "dog", "cat"), A = 0:2, B = 1:3,
C = c(2L, 3L, 0L)), .Names = c("taxonomy", "A", "B", "C"), class = "data.frame", row.names = c("1",
"2", "3"))
df2 <- structure(list(taxonomy = c("rat", "Horse", "cat"), A = c(0L,
0L, 2L), D = c(1L, 2L, 0L), C = c(9L, 6L, 2L)), .Names = c("taxonomy",
"A", "D", "C"), class = "data.frame", row.names = c("1", "2",
"3"))

The data.table equivalent of @avid_useR's answer.
library(data.table)
# Row-bind both frames (fill = TRUE pads columns missing from one frame with
# NA), then sum every remaining column within each taxonomy.
rbindlist(list(df1, df2), fill = TRUE)[
  ,
  lapply(.SD, sum, na.rm = TRUE),
  by = taxonomy
]
# taxonomy A B C D
#1: rat 0 1 11 1
#2: dog 1 2 3 0
#3: cat 4 3 2 0
#4: Horse 0 0 6 2

You can do...
> library(reshape2)
> dcast(rbind(melt(DF1), melt(DF2)), taxonomy ~ variable, fun.aggregate = sum)
Using taxonomy as id variables
Using taxonomy as id variables
taxonomy A B C D
1 cat 4 3 2 0
2 dog 1 2 3 0
3 Horse 0 0 6 2
4 rat 0 1 11 1
This sorts the rows and columns alphabetically, but I guess this might be avoidable by using a factor.
Data:
DF1 = structure(list(taxonomy = c("rat", "dog", "cat"), A = 0:2, B = 1:3,
C = c(2L, 3L, 0L)), .Names = c("taxonomy", "A", "B", "C"), row.names = c(NA,
-3L), class = "data.frame")
DF2 = structure(list(taxonomy = c("rat", "Horse", "cat"), A = c(0L,
0L, 2L), D = c(1L, 2L, 0L), C = c(9L, 6L, 2L)), .Names = c("taxonomy",
"A", "D", "C"), row.names = c(NA, -3L), class = "data.frame")

Related

Two step dataframe rearrange in R

I import a csv into a dataframe with this structure:
id brand p_1 p_2 p_3 p_4 p_5
1 A 1 2 5
2 B 2 3
3 C 3
4 B 1
5 A 2
And I would like to first get it into this structure
p A B C
1 1 1 0
2 2 1 0
3 0 1 1
4 0 0 0
5 1 0 0
So it counts all combinations of values, BUT it also counts non-existing ones such as 4, which does not appear YET is a value between 1 (min) and 5 (max), and this is the tricky part!
Thanks!
library(dplyr)
library(tidyr)

df %>%
  # Stack every column except the first two (id, brand) into one `value`
  # column, dropping the empty (NA) cells while reshaping.
  pivot_longer(-(1:2), values_drop_na = TRUE) %>%
  count(value, brand) %>%
  # Add the value/brand pairs that never occur (e.g. value 4) with n = 0.
  complete(value = 1:5, brand, fill = list(n = 0)) %>%
  pivot_wider(names_from = brand, values_from = n, values_fill = 0)
result
# A tibble: 5 × 4
value A B C
<int> <int> <int> <int>
1 1 1 1 0
2 2 2 1 0
3 3 0 1 1
4 4 0 0 0
5 5 1 0 0
source data
df <- data.frame(
stringsAsFactors = FALSE,
id = c(1L, 2L, 3L, 4L, 5L),
brand = c("A", "B", "C", "B", "A"),
p_1 = c(1L, 2L, 3L, 1L, 2L),
p_2 = c(2L, 3L, NA, NA, NA),
p_3 = c(5L, NA, NA, NA, NA),
p_4 = c(NA, NA, NA, NA, NA),
p_5 = c(NA, NA, NA, NA, NA)
)

How to create multiple columns in r dataframe by implementing some query conditions

I have a dataset which is similar to the following:
Age Food_1_1 Food_1_2 Food_1_3 Amount_1_1 Amount_1_2 Amount_1_3
6-9 a b a 2 3 4
6-9 b b c 1 2 3
6-9 c a 4 1
9-10 c c b 1 3 1
9-10 c a b 1 2 1
Using R, I want to get the following data set which contains new set of columns a, b and c by adding the corresponding values:
Age Food_1_1 Food_1_2 Food_1_3 Amount_1_1 Amount_1_2 Amount_1_3 a b c
6-9 a b a 2 3 4 6 3 0
6-9 b b c 1 2 3 0 3 3
6-9 c a 4 1 1 0 4
9-10 c c b 1 3 1 0 1 4
9-10 c a b 1 2 1 2 1 1
Note: My data also contains missing values. The variables Monday:Wednesday are factors and the variables Value1:Value3 are numeric. For more clarity: the 1st row of column "a" contains the sum of all values from Value1 to Value3 related to a (for example, 2 + 4 = 6).
One way using base R:
# Tag every row with a unique id so it can be tracked through the reshapes.
data$id <- seq_len(nrow(data))
# Positions of the letter columns (names ending in "day") and of the amount
# columns (names starting with "Value").
vlist <- list(grep("day$", names(data)), grep("^Value", names(data)))
# Long format: one row per (id, Day, Value) pair.
d1 <- reshape(data, direction = "long", varying = vlist, v.names = c("Day", "Value"))
# Sum the values per row id and letter.
d2 <- aggregate(Value ~ id + Day, FUN = sum, na.rm = TRUE, data = d1)
# Back to wide format: one Value.<letter> column per letter.
d3 <- reshape(d2, direction = "wide", v.names = "Value", timevar = "Day")
# A letter that does not occur in a row has no sum; report it as 0.
d3[is.na(d3)] <- 0
merge(data, d3, by = "id", all.x = TRUE)
# id Age Monday Tuesday Wednesday Value1 Value2 Value3 Value.a Value.b Value.c
#1 1 6-9 a b a 2 3 4 6 3 0
#2 2 6-9 b b c 1 2 3 0 3 3
#3 3 6-9 <NA> c a NA 4 1 1 0 4
#4 4 9-10 c c b 1 3 1 0 1 4
#5 5 9-10 c a b 1 2 1 2 1 1
Data:
data <- structure(list(Age = structure(c(1L, 1L, 1L, 2L, 2L), .Label = c("6-9",
"9-10"), class = "factor"), Monday = structure(c(1L, 2L, NA,
3L, 3L), .Label = c("a", "b", "c"), class = "factor"), Tuesday = structure(c(2L,
2L, 3L, 3L, 1L), .Label = c("a", "b", "c"), class = "factor"),
Wednesday = structure(c(1L, 3L, 1L, 2L, 2L), .Label = c("a",
"b", "c"), class = "factor"), Value1 = c(2L, 1L, NA, 1L,
1L), Value2 = c(3L, 2L, 4L, 3L, 2L), Value3 = c(4L, 3L, 1L,
1L, 1L)), class = "data.frame", row.names = c(NA, -5L))
You can use below code:
library(dplyr)
library(tidyr)
library(reshape2)

# Work on character columns and remember each original row.
data[] <- lapply(data, as.character)
data$rownumber <- rownames(data)
# Stack the letter columns and the value columns separately. Both stacks are
# built the same way, so the running index row2 pairs each letter with its
# value. (row_number() is used because x and y do not exist yet while their
# own pipelines are being evaluated.)
x <- gather(data[, c(1:4, 8)], Day, Letter, Monday:Wednesday) %>%
  mutate(row2 = row_number())
y <- gather(data[, c(1, 5:7, 8)], Day, Value, Value1:Value3) %>%
  mutate(row2 = row_number())
# Sum the values per original row and letter.
z <- left_join(x, y, by = c("Age", "rownumber", "row2")) %>%
  group_by(Age, rownumber, Letter) %>%
  dplyr::summarise(suma = sum(as.numeric(Value), na.rm = TRUE)) %>%
  mutate(suma = replace_na(suma, 0))
# One column per letter, then attach the original columns again.
z <- dcast(z, rownumber ~ Letter, value.var = "suma") %>%
  left_join(data, by = "rownumber")
# NOTE(review): Var.2 is presumably the column dcast creates for an
# empty-string Letter - confirm against the real data.
z$Var.2 <- NULL
z[is.na(z)] <- 0
Output:
rownumber a b c Age Monday Tuesday Wednesday Value1 Value2 Value3
1 1 6 3 0 6-9 a b a 2 3 4
2 2 0 3 3 6-9 b b c 1 2 3
3 3 1 0 4 6-9 c a 0 4 1
4 4 0 1 4 9-10 c c b 1 3 1
5 5 2 1 1 9-10 c a b 1 2 1

Create combination of all variables within a group

I have a dataframe as follows
group x y
a 1 2
a 3 1
b 1 3
c 1 1
c 2 3
I want to be able to generate all combinations of the x and y columns within a group, like so
group xy
a 1-2
a 1-1
a 3-2
a 3-1
b 1-3
c 1-1
c 1-3
c 2-1
c 2-3
I've tried using the following code, but it seems as though the group_by function is not working as expected
library(dplyr)
library(tidyr)
combn <- df %>%
group_by(group) %>%
expand(x, y)
My current results are instead giving me every combination of all three columns
head(combn)
group x y
a 1 1
a 1 2
a 1 3
a 2 1
a 2 2
a 2 3
Dput:
structure(list(group = structure(c(1L, 1L, 2L, 3L, 3L), .Label = c("a",
"b", "c"), class = "factor"), x = structure(c(1L, 3L, 1L, 1L,
2L), .Label = c("1", "2", "3"), class = "factor"), y = structure(c(2L,
1L, 3L, 1L, 3L), .Label = c("1", "2", "3"), class = "factor")), class = "data.frame", row.names = c(NA,
-5L))
You could use crossing from tidyr to create combinations within a group and then unnest to create them as separate rows.
library(dplyr)
df1 <- df %>%
  group_by(group) %>%
  # crossing() is a tidyr function and tidyr is not attached here, so it is
  # called with an explicit namespace.
  summarise(xy = list(tidyr::crossing(x, y))) %>%
  tidyr::unnest(xy)
df1
# The columns are named after the crossing() arguments (x and y).
# NOTE(review): row order within a group may differ by tidyr version - confirm.
# group x y
# <fct> <fct> <fct>
#1 a 1 2
#2 a 3 2
#3 a 1 1
#4 a 3 1
#5 b 1 3
#6 c 1 1
#7 c 2 1
#8 c 1 3
#9 c 2 3
If you want to combine the two columns, you could use unite :
# df1's value columns are named x and y (after the crossing() arguments).
tidyr::unite(df1, xy, x, y, sep = "-")
# group xy
# <fct> <chr>
#1 a 1-2
#2 a 3-2
#3 a 1-1
#4 a 3-1
#5 b 1-3
#6 c 1-1
#7 c 2-1
#8 c 1-3
#9 c 2-3

R aggregate() function: Sum and show missing values = 0

I want to sum the "value" column by group1 and by group2.
group2 can range from 1 to 5.
If there is no entry for group2, the sum should be 0.
Data:
group1 group2 value
a 1 100
a 2 200
a 3 300
b 1 10
b 2 20
I am using
aggregate(data$value, by=(list(data$group1, data$group2)), FUN = sum)
which gives
group1 group2 value
a 1 100
a 2 200
a 3 300
b 1 10
b 2 20
However, the result should look like
group1 group2 value
a 1 100
a 2 200
a 3 300
a 4 0
a 5 0
b 1 10
b 2 20
b 3 0
b 4 0
b 5 0
How can I address this using the aggregate function in R?
Thank you!
We can use complete from tidyr to complete missing combinations.
library(dplyr)
library(tidyr)
df %>%
group_by(group1, group2) %>%
summarise(value = sum(value)) %>%
complete(group2 = 1:5, fill = list(value = 0))
# group1 group2 value
# <fct> <int> <dbl>
# 1 a 1 100
# 2 a 2 200
# 3 a 3 300
# 4 a 4 0
# 5 a 5 0
# 6 b 1 10
# 7 b 2 20
# 8 b 3 0
# 9 b 4 0
#10 b 5 0
data
df <- structure(list(group1 = structure(c(1L, 1L, 1L, 2L, 2L), .Label = c("a",
"b"), class = "factor"), group2 = c(1L, 2L, 3L, 1L, 2L), value = c(100L,
200L, 300L, 10L, 20L)), class = "data.frame", row.names = c(NA, -5L))
You need of course to tell R that "group 2 can range from 1 to 5". Best you merge it with an expand.grid accordingly and use with.
# group2 is known to range over 1:5: build every group1/group2 cell with a zero
# placeholder value, merge so that both placeholder and observed rows are kept,
# and sum the values per cell. The group1 values are taken from the data
# instead of being hard-coded.
with(merge(expand.grid(group1 = unique(data$group1), group2 = 1:5, value = 0),
           data, all = TRUE),
     aggregate(value, by = list(group1, group2), FUN = sum))
# Group.1 Group.2 x
# 1 a 1 100
# 2 b 1 10
# 3 a 2 200
# 4 b 2 20
# 5 a 3 300
# 6 b 3 0
# 7 a 4 0
# 8 b 4 0
# 9 a 5 0
# 10 b 5 0
Data:
data <- structure(list(group1 = c("a", "a", "a", "b", "b"), group2 = c(1L,
2L, 3L, 1L, 2L), value = c(100L, 200L, 300L, 10L, 20L)), row.names = c(NA,
-5L), class = "data.frame")

R Data Frame remove rows with max values from all columns

Hello, I have a data frame and I need to remove all the rows that contain the max value of any column.
Example
A B C
1 2 3 5
2 4 1 1
3 1 4 3
4 2 1 1
So the output is:
A B C
4 2 1 1
Is there any quick way to do this?
We can do this with %in%
df1[!seq_len(nrow(df1)) %in% sapply(df1, which.max),]
# A B C
#4 2 1 1
If there are ties for the maximum value within a column, then do
df1[!Reduce(`|`, lapply(df1, function(x) x== max(x))),]
df[-sapply(df, which.max),]
# A B C
#4 2 1 1
DATA
df = structure(list(A = c(2L, 4L, 1L, 2L), B = c(3L, 1L, 4L, 1L),
C = c(5L, 1L, 3L, 1L)), .Names = c("A", "B", "C"),
class = "data.frame", row.names = c(NA,-4L))

Resources