Create a variable that indicates the source of the data with R - r

I want to create a variable that indicates in which dataframe an observation is located (identified with the variable "code").
I have this database:
id code var1 var2 var9
1 1 a 3 5 4
2 2 b 4 54 5435
3 3 c 44 5 5
4 4 d 5 5 54
5 5 e 6 5 6
6 6 f 6 5 6
And these dataframes:
df1
code var2 var3
1 a 23 4
2 e 45 6
3 k 56 98
df2
code var2 var3
1 b 324 4343
2 z 34 545
3 q 545 6
4 j 77 67
df3
code var2 var3
1 c 1 1
2 l 78 56
df4
code var2 var3
1 d 2 2
2 j 1 1
df5
code var2 var3
1 f 5335 343
My expected result:
id code var1 var2 var9 source
1 a 3 5 4 df1
2 b 4 54 5435 df2
3 c 44 5 5 df3
4 d 5 5 54 df4
5 e 6 5 6 df1
6 f 6 5 6 df5
Data
df <- structure(list(id = 1:6, code = c("a", "b", "c", "d", "e", "f"), var1 = c(3L, 4L, 44L, 5L, 6L, 6L), var2 = c(5L, 54L, 5L, 5L, 5L, 5L), var9 = c(4L, 5435L, 5L, 54L, 6L, 6L)), class = "data.frame", row.names = c(NA, -6L))
df1 <- structure(list(code = c("a", "e", "k"), var2 = c(23L, 45L, 56L), var3 = c(4L, 6L, 98L)), class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(code = c("b", "z", "q", "j"), var2 = c(324L, 34L, 545L, 77L), var3 = c(4343L, 545L, 6L, 67L)), class = "data.frame", row.names = c(NA, -4L))
df3 <- structure(list(code = c("c", "l"), var2 = c(1L, 78L), var3 = c(1L, 56L)), class = "data.frame", row.names = c(NA, -2L))
df4 <- structure(list(code = c("d", "j"), var2 = 2:1, var3 = 2:1), class = "data.frame", row.names = c(NA, -2L))
df5 <- structure(list(code = "f", var2 = 5335L, var3 = 343L), class = "data.frame", row.names = c(NA, -1L))

You can use bind_rows from dplyr:
library(dplyr)
# Stack the candidate data frames; `.id` stores the name each data frame was
# passed under in a new `source` column.
sources <- bind_rows(df1 = df1, df2 = df2, df3 = df3, df4 = df4, df5 = df5, .id = 'source')
sources
#> source code var2 var3
#> 1 df1 a 23 4
#> 2 df1 e 45 6
#> 3 df1 k 56 98
#> 4 df2 b 324 4343
#> 5 df2 z 34 545
#> 6 df2 q 545 6
#> 7 df2 j 77 67
#> 8 df3 c 1 1
#> 9 df3 l 78 56
#> 10 df4 d 2 2
#> 11 df4 j 1 1
#> 12 df5 f 5335 343
# Attach the label to the main data frame by matching on `code`. Only `source`
# and `code` are kept from the lookup so its var2/var3 do not clash with df's
# own columns. A code found in several data frames would give one row per match.
df %>%
  left_join(select(sources, source, code), by = "code")
#> id code var1 var2 var9 source
#> 1 1 a 3 5 4 df1
#> 2 2 b 4 54 5435 df2
#> 3 3 c 44 5 5 df3
#> 4 4 d 5 5 54 df4
#> 5 5 e 6 5 6 df1
#> 6 6 f 6 5 6 df5

Related

How to move two specific rows to top of dataframe?

Below I have a DF.
A B C D
a 4 2 2
g 5 2 2
d 7 65 7
e 3 6 7
I would like to make this DF so that column A has "g" in the first row, and "d" in the second row. I would like to do this by calling the value in column A (rather than an index). How can I do this?
Ideal output
A B C D
g 5 2 2
d 7 65 7
a 4 2 2
e 3 6 7
We may convert to a factor with the levels specified in the desired order before arranging
library(forcats)
library(dplyr)
# Move the levels "g" and "d" to the front of the level order, then sort the
# rows of DF by that re-levelled factor.
arrange(DF, fct_relevel(A, 'g', 'd'))
A B C D
1 g 5 2 2
2 d 7 65 7
3 a 4 2 2
4 e 3 6 7
with fct_relevel, we can specify the order of specific levels without specifying the rest of the levels
> with(DF, fct_relevel(A, 'g', 'd'))
[1] a g d e
Levels: g d a e
data
DF <- structure(list(A = c("a", "g", "d", "e"), B = c(4L, 5L, 7L, 3L
), C = c(2L, 2L, 65L, 6L), D = c(2L, 2L, 7L, 7L)), class = "data.frame",
row.names = c(NA,
-4L))
Another possible solution:
library(dplyr)
df <- data.frame(
  stringsAsFactors = FALSE,
  A = c("a", "g", "d", "e"),
  B = c(4L, 5L, 7L, 3L),
  C = c(2L, 2L, 65L, 6L),
  D = c(2L, 2L, 7L, 7L)
)
# Full target order: "g" and "d" first, then every other value of A in its
# original order. setdiff(x, y) keeps the elements of x that are not in y, so
# A has to be the first argument; with the arguments swapped the result is
# empty and the ordering only works because arrange() sorts NA last.
target_order <- c("g", "d", setdiff(df$A, c("g", "d")))
df %>% arrange(match(A, target_order))
#> A B C D
#> 1 g 5 2 2
#> 2 d 7 65 7
#> 3 a 4 2 2
#> 4 e 3 6 7
Try the code below
# Desired order of A: "g", "d", then the remaining values as they appear
wanted <- c("g", "d", df$A[!(df$A %in% c("g", "d"))])
# match() returns the row that holds each wanted value; index the rows by it
df[match(wanted, df$A), ]
and you will see
A B C D
2 g 5 2 2
3 d 7 65 7
1 a 4 2 2
4 e 3 6 7
Just to add a base R solution if you are not interested in external packages, you can specify the row order directly:
# Sample Data
DF <- structure(list(A = c("a", "g", "d", "e"), B = c(4L, 5L, 7L, 3L
), C = c(2L, 2L, 65L, 6L), D = c(2L, 2L, 7L, 7L)), class = "data.frame",
row.names = c(NA, -4L))
A hard code for this example:
DF2 <- DF[c(2,3,1,4),]
A more generalizable example:
# Values to move to the top, in the order they should appear
top_values <- c("g", "d")
# Rows holding those values, collected value by value so the requested order
# (all "g" rows, then all "d" rows) is kept wherever they sit in DF
rownums <- unlist(lapply(top_values, function(v) which(DF$A == v)))
# Every remaining row, in its original order; seq_len() is safe for 0 rows
otherrows <- setdiff(seq_len(nrow(DF)), rownums)
# Organize
DF2 <- DF[c(rownums, otherrows), ]

Merge 2 data frames by row and column overlap

I would like to merge 2 data frames additively such that
taxonomy A B C
1 rat 0 1 2
2 dog 1 2 3
3 cat 2 3 0
and
taxonomy A D C
1 rat 0 1 9
2 Horse 0 2 6
3 cat 2 0 2
produce
taxonomy A B C D
1 rat 0 1 11 1
2 Horse 0 0 6 2
3 cat 4 3 2 0
4 dog 1 2 3 0
I've tried aggregate, merge, apply, ddply... with no success. This will be done on 2 data frames with a couple hundred rows and columns.
With bind_rows from dplyr:
library(dplyr)
# Stack both tables (columns missing from one side become NA), then add up
# every non-grouping column per taxonomy while ignoring that NA padding.
# across() is the current replacement for the superseded summarize_all().
bind_rows(df1, df2) %>%
  group_by(taxonomy) %>%
  summarise(across(everything(), ~ sum(.x, na.rm = TRUE)))
Output:
# A tibble: 4 x 5
taxonomy A B C D
<chr> <int> <int> <int> <int>
1 cat 4 3 2 0
2 dog 1 2 3 0
3 Horse 0 0 6 2
4 rat 0 1 11 1
Data:
df1 <- structure(list(taxonomy = c("rat", "dog", "cat"), A = 0:2, B = 1:3,
C = c(2L, 3L, 0L)), .Names = c("taxonomy", "A", "B", "C"), class = "data.frame", row.names = c("1",
"2", "3"))
df2 <- structure(list(taxonomy = c("rat", "Horse", "cat"), A = c(0L,
0L, 2L), D = c(1L, 2L, 0L), C = c(9L, 6L, 2L)), .Names = c("taxonomy",
"A", "D", "C"), class = "data.frame", row.names = c("1", "2",
"3"))
The data.table equivalent of @avid_useR's answer.
library(data.table)
# Stack both tables; fill = TRUE pads columns missing from one side with NA
stacked <- rbindlist(list(df1, df2), fill = TRUE)
# Per taxonomy, sum every remaining column while ignoring the NA padding
stacked[, lapply(.SD, sum, na.rm = TRUE), by = taxonomy]
# taxonomy A B C D
#1: rat 0 1 11 1
#2: dog 1 2 3 0
#3: cat 4 3 2 0
#4: Horse 0 0 6 2
You can do...
> library(reshape2)
> dcast(rbind(melt(DF1), melt(DF2)), taxonomy ~ variable, fun.aggregate = sum)
Using taxonomy as id variables
Using taxonomy as id variables
taxonomy A B C D
1 cat 4 3 2 0
2 dog 1 2 3 0
3 Horse 0 0 6 2
4 rat 0 1 11 1
This sorts the rows and columns alphabetically, but I guess this might be avoidable by using a factor.
Data:
DF1 = structure(list(taxonomy = c("rat", "dog", "cat"), A = 0:2, B = 1:3,
C = c(2L, 3L, 0L)), .Names = c("taxonomy", "A", "B", "C"), row.names = c(NA,
-3L), class = "data.frame")
DF2 = structure(list(taxonomy = c("rat", "Horse", "cat"), A = c(0L,
0L, 2L), D = c(1L, 2L, 0L), C = c(9L, 6L, 2L)), .Names = c("taxonomy",
"A", "D", "C"), row.names = c(NA, -3L), class = "data.frame")

How to sort rows of a data frame based on a vector using dplyr pipe

I have the following data frame:
library(tidyverse)
dat <- structure(list(var1 = c(1L, 2L, 2L, 3L, 1L), var2 = structure(c(10L,
1L, 8L, 3L, 5L), .Label = c("b", "c", "f", "h", "i", "o", "s",
"t", "w", "x"), class = "factor"), var3 = c(7L, 5L, 5L, 8L, 5L
), var4 = structure(c(8L, 5L, 1L, 4L, 7L), .Label = c("b", "c",
"d", "e", "f", "h", "i", "w", "y"), class = "factor")), .Names = c("var1",
"var2", "var3", "var4"), row.names = c(NA, 5L), class = "data.frame")
dat
#> var1 var2 var3 var4
#> 1 1 x 7 w
#> 2 2 b 5 f
#> 3 2 t 5 b
#> 4 3 f 8 e
#> 5 1 i 5 i
What I want to do is to sort/arrange the var2 column
based on a predefined order:
my_order <- c('t','f','x','b','i')
The final desired result is this:
var1 var2 var3 var4
2 t 5 b
3 f 8 e
1 x 7 w
2 b 5 f
1 i 5 i
I'd like to do it under dplyr piping. How can I achieve that?
The best I can do is this:
> dat %>%
+ arrange(var2)
var1 var2 var3 var4
1 2 t 5 b
2 3 f 8 e
3 1 x 7 w
4 2 b 5 f
5 1 i 5 i
We can use arrange with match
library(dplyr)
# Rank each var2 value by its position in my_order and sort the rows on it
arrange(dat, match(var2, my_order))
# var1 var2 var3 var4
#1 2 t 5 b
#2 3 f 8 e
#3 1 x 7 w
#4 2 b 5 f
#5 1 i 5 i
We can convert the column to a factor with levels specified as 'my_order' (but it doesn't change the type of the actual column)
library(dplyr)
# Sort on a temporary factor whose levels follow my_order; the var2 column
# stored in dat is not modified.
arrange(dat, factor(var2, levels = my_order))
# var1 var2 var3 var4
#1 2 t 5 b
#2 3 f 8 e
#3 1 x 7 w
#4 2 b 5 f
#5 1 i 5 i

merge and get max value from two different datatables in R

I have 2 different data.tables. I need to merge them and get the max value for each row (matched on X). Examples of the two tables are given as Input below, followed by the expected output.
Input
Table 1
X A B
A 3
B 4 6
C 5
D 9 12
Table 2
X A B
A 1 5
B 6 8
C 7 14
D 5
E 1 1
F 2 3
G 5 6
Expected Output:
X A B
A 3 5
B 6 8
C 7 14
D 9 12
E 1 1
F 2 3
G 5 6
We can rbind the two datasets and do a group by max
library(data.table)
# Stack both tables, then take the per-X maximum of every other column
combined <- rbindlist(list(tbl1, tbl2))
combined[, lapply(.SD, max, na.rm = TRUE), by = X]
# X A B
#1: A 3 5
#2: B 6 8
#3: C 7 14
#4: D 9 12
#5: E 1 1
#6: F 2 3
#7: G 5 6
If we are using base R, then use aggregate after rbinding the datasets
# Stack both tables; na.action = NULL keeps rows containing NA so that
# max(..., na.rm = TRUE) can drop the missing values itself.
stacked_tbl <- rbind(tbl1, tbl2)
aggregate(. ~ X, data = stacked_tbl, FUN = max, na.rm = TRUE, na.action = NULL)
NOTE: Assume that the 'A', 'B' columns are numeric and blanks are NA
data
tbl1 <- structure(list(X = c("A", "B", "C", "D"), A = c(3L, 4L, 5L, 9L
), B = c(NA, 6L, NA, 12L)), .Names = c("X", "A", "B"), class = "data.frame",
row.names = c(NA, -4L))
tbl2 <- structure(list(X = c("A", "B", "C", "D", "E", "F", "G"), A = c(1L,
6L, 7L, 5L, 1L, 2L, 5L), B = c(5L, 8L, 14L, NA, 1L, 3L, 6L)), .Names = c("X",
"A", "B"), class = "data.frame",
row.names = c(NA, -7L))

How to intersect values from two data frames with R

I would like to create a new column for a data frame with values from the intersection of a row and a column.
I have a data.frame called "time":
q 1 2 3 4 5
a 1 13 43 5 3
b 2 21 12 3353 34
c 3 21 312 123 343
d 4 123 213 123 35
e 4556 11 123 12 3
And another table, called "event":
q dt
a 1
b 3
c 4
d 2
e 1
I want to add another column called inter to the second table, filled with the values found at the intersection of row q and column dt in the first data.frame. So the result would be this:
q dt inter
a 1 1
b 3 12
c 4 123
d 2 123
e 1 4556
I have tried to use merge(event, time, by.x = "q", by.y = "dt"), but it generates an error saying that they aren't the same id. I have also tried to transpose the time data.frame to cross-reference the values, but without success.
library(reshape2)
# Long format: one row per (q, variable) pair with the cell content in `value`
time_long <- melt(time, id.vars = "q")
# Left join: keep every event row and pick the cell whose column name equals dt
merge(
  event, time_long,
  by.x = c('q', 'dt'), by.y = c('q', 'variable'),
  all.x = TRUE
)
Output:
q dt value
1 a 1 1
2 b 3 12
3 c 4 123
4 d 2 123
5 e 1 4556
Notes
We use the function melt from the package reshape2 to convert the data frame time from wide to long format. Then we merge (left outer join) the data frames event and the melted time by two columns (q and dt in event, q and variable in the melted time).
Data:
time <- structure(list(q = structure(1:5, .Label = c("a", "b", "c", "d",
"e"), class = "factor"), `1` = c(1L, 2L, 3L, 4L, 4556L), `2` = c(13L,
21L, 21L, 123L, 11L), `3` = c(43L, 12L, 312L, 213L, 123L), `4` = c(5L,
3353L, 123L, 123L, 12L), `5` = c(3L, 34L, 343L, 35L, 3L)), .Names = c("q",
"1", "2", "3", "4", "5"), class = "data.frame", row.names = c(NA,
-5L))
event <- structure(list(q = structure(1:5, .Label = c("a", "b", "c", "d",
"e"), class = "factor"), dt = c(1L, 3L, 4L, 2L, 1L)), .Names = c("q",
"dt"), class = "data.frame", row.names = c(NA, -5L))
This may be a little clunky but it works:
# Join once, outside any loop: one row per q holding the time columns and dt
xx <- merge(time, event, by = 'q')
dt <- xx$dt
# For row i pick the time column selected by dt[i]; the +1 skips the leading
# q column (this relies on the time columns "1".."5" being in that order).
# The original indexed an undefined object `y` here instead of `xx`.
inter <- vapply(
  seq_len(nrow(xx)),
  function(i) xx[i, dt[i] + 1L],
  numeric(1)
)
# q is taken from the merged frame so it stays aligned with dt and inter;
# cbind() on a factor keeps its integer codes, hence q prints as 1..5
final <- cbind(xx[, 1], dt, inter)
colnames(final) <- c('q', 'dt', 'inter')
Hope it helps.
Output:
q dt inter
[1,] 1 1 1
[2,] 2 3 12
[3,] 3 4 123
[4,] 4 2 123
[5,] 5 1 4556

Resources