Get new columns based on data from other columns - r

My data:
data <- structure(list(col1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), col2 = c(0L, 1L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-18L))
I want to get 2 new columns based on col1 and col2.
column 3 is obtained: We leave units if there is zero in the second column, 2 are simply transferred.
column 4 will turn out: We leave units if there is one in the second column, 2 are simply transferred.
What I want to get:
data <- structure(list(col1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), col2 = c(0L, 1L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), group1 = c(1L,
NA, NA, 1L, 1L, NA, 1L, NA, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), group2 = c(NA, 1L, 1L, NA, NA, 1L, NA, 1L, NA, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-18L))

A solution that uses tidyr::pivot_wider():
library(dplyr)
data %>%
mutate(id = 1:n(), name = paste0("group", col2 + 1), value = 1) %>%
tidyr::pivot_wider() %>%
mutate(col2 = replace(col2, col1 == 2, 0),
across(starts_with("group"), replace, col1 == 2, 2)) %>%
select(-id)
# A tibble: 18 x 4
col1 col2 group1 group2
<int> <dbl> <dbl> <dbl>
1 1 0 1 NA
2 1 1 NA 1
3 1 1 NA 1
4 1 0 1 NA
5 1 0 1 NA
6 1 1 NA 1
7 1 0 1 NA
8 1 1 NA 1
9 1 0 1 NA
10 2 0 2 2
11 2 0 2 2
12 2 0 2 2
13 2 0 2 2
14 2 0 2 2
15 2 0 2 2
16 2 0 2 2
17 2 0 2 2
18 2 0 2 2

You can use ifelse to get group1 and group2.
transform(data
, group1 = ifelse(col1==2, 2, ifelse(col2==0, 1, NA))
, group2 = ifelse(col1==2, 2, ifelse(col2==1, 1, NA))
)
# col1 col2 group1 group2
#1 1 0 1 NA
#2 1 1 NA 1
#3 1 1 NA 1
#4 1 0 1 NA
#5 1 0 1 NA
#6 1 1 NA 1
#7 1 0 1 NA
#8 1 1 NA 1
#9 1 0 1 NA
#10 2 0 2 2
#11 2 1 2 2
#12 2 1 2 2
#13 2 0 2 2
#14 2 0 2 2
#15 2 1 2 2
#16 2 0 2 2
#17 2 1 2 2
#18 2 0 2 2

Related

How to convert 0 across specified columns to NAs

I have a dataframe:
dat <- data.frame(col1 = sample(0:3, 10, replace = TRUE),
col2 = sample(0:3, 10, replace = TRUE),
col3 = sample(0:3, 10, replace = TRUE),
col4 = sample(0:3, 10, replace = TRUE))
I want to convert the 0s in col1, col2 and col4 to NAs. How can I do that? Most examples do the other way around.
Thanks!
In base R:
dat[-3][dat[-3] == 0] <- NA
#or
replace(dat[-3], dat[-3] == 0, NA)
You can use across to ignore col3 and use na_if to replace 0 with NA.
library(dplyr)
dat %>% mutate(across(-col3, na_if, 0))
col1 col2 col3 col4
1 NA 2 1 3
2 1 NA 0 2
3 NA 1 3 2
4 2 NA 1 3
5 1 1 1 2
6 2 1 0 NA
7 2 3 0 1
8 1 2 0 2
9 3 1 0 3
10 3 2 0 1
Data
dat <- structure(list(col1 = c(0L, 1L, 0L, 2L, 1L, 2L, 2L, 1L, 3L, 3L
), col2 = c(2L, 0L, 1L, 0L, 1L, 1L, 3L, 2L, 1L, 2L), col3 = c(1L,
0L, 3L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), col4 = c(3L, 2L, 2L, 3L,
2L, 0L, 1L, 2L, 3L, 1L)), class = "data.frame", row.names = c(NA,
-10L))

How to do multiple arithmetic operations in R by group

have several datasets. The first one
lid=structure(list(x1 = 619490L, x2 = 10L, x3 = 0L, x4 = 6089230L,
x5 = 0L, x6 = -10L), class = "data.frame", row.names = c(NA,
-1L))
second dataset
lidar=structure(list(A = c(638238.76, 638238.76, 638239.29, 638235.39,
638233.86, 638233.86, 638235.55, 638231.97, 638231.91, 638228.41,
638238.76, 638238.76, 63239.29, 638235.39, 638233.86, 638233.86,
638235.55, 638231.97, 638231.91, 638228.41), B = c(6078001.09,
6078001.09, 6078001.15, 6078001.15, 6078001.07, 6078001.07, 6078001.02,
6078001.08, 6078001.09, 6078001.01, 6078001.09, 6078001.09, 6078001.15,
6078001.15, 6078001.07, 6078001.07, 6078001.02, 6078001.08, 6078001.09,
6078001.01), C = c(186.64, 186.59, 199.28, 189.37, 186.67, 186.67,
198.04, 200.03, 199.73, 192.14, 186.64, 186.59, 199.28, 189.37,
196.67, 186.67, 198.04, 200.03, 199.73, 100.14), gpstime = c(319805734.664265,
319805734.664265, 319805734.67875, 319805734.678768, 319805734.678777,
319805734.678777, 319805734.687338, 319805734.701928, 319805734.701928,
319805734.701945, 319805734.664265, 319805734.664265, 319805734.67875,
319805734.678768, 319805734.678777, 319805734.678777, 319805734.687338,
319805734.701928, 319805734.701928, 319805734.701945), Intensity = c(13L,
99L, 5L, 2L, 20L, 189L, 2L, 11L, 90L, 1L, 13L, 99L, 5L, 2L, 20L,
189L, 2L, 11L, 90L, 1L), ReturnNumber = c(2L, 1L, 1L, 2L, 1L,
1L, 2L, 1L, 1L, 3L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 3L),
NumberOfReturns = c(2L, 1L, 3L, 2L, 1L, 1L, 3L, 1L, 1L, 4L,
2L, 1L, 3L, 2L, 1L, 1L, 3L, 1L, 1L, 4L), ScanDirectionFlag = c(1L,
1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L), EdgeOfFlightline = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L
), Classification = c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), group = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-20L))
lid static dataset, it always same(always the same values).
I must perform these arithmetic operations
` lidar$row <- round((lidar$A-lid$x1)/lid$x3, 0)
lidar$col <- (lidar$B-lid$x4)/lid$x6
lidar$cdif <- max(lidar$C)-min(lidar$C)
but for each lidar$groups separately.
How can i do it better using dplyr?
Thanks for your help.
library(dplyr)
lidar %>%
group_by(group) %>%
mutate(
row = (A-lid$x1)/lid$x3,
col = (B-lid$x4)/lid$x6,
cdif = max(C)-min(C)
) %>%
ungroup()
# # A tibble: 20 x 14
# A B C gpstime Intensity ReturnNumber NumberOfReturns ScanDirectionFlag EdgeOfFlightline Classification group row col cdif
# <dbl> <dbl> <dbl> <dbl> <int> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
# 1 638239. 6078001. 187. 319805735. 13 2 2 1 0 1 1 Inf 1123. 13.4
# 2 638239. 6078001. 187. 319805735. 99 1 1 1 0 2 1 Inf 1123. 13.4
# 3 638239. 6078001. 199. 319805735. 5 1 3 0 0 1 1 Inf 1123. 13.4
# 4 638235. 6078001. 189. 319805735. 2 2 2 0 0 1 1 Inf 1123. 13.4
# 5 638234. 6078001. 187. 319805735. 20 1 1 0 0 1 1 Inf 1123. 13.4
# 6 638234. 6078001. 187. 319805735. 189 1 1 0 0 1 1 Inf 1123. 13.4
# 7 638236. 6078001. 198. 319805735. 2 2 3 1 0 1 1 Inf 1123. 13.4
# 8 638232. 6078001. 200. 319805735. 11 1 1 0 0 1 1 Inf 1123. 13.4
# 9 638232. 6078001. 200. 319805735. 90 1 1 0 0 1 1 Inf 1123. 13.4
# 10 638228. 6078001. 192. 319805735. 1 3 4 0 0 1 1 Inf 1123. 13.4
# 11 638239. 6078001. 187. 319805735. 13 2 2 1 0 1 2 Inf 1123. 99.9
# 12 638239. 6078001. 187. 319805735. 99 1 1 1 0 2 2 Inf 1123. 99.9
# 13 63239. 6078001. 199. 319805735. 5 1 3 0 0 1 2 -Inf 1123. 99.9
# 14 638235. 6078001. 189. 319805735. 2 2 2 0 0 1 2 Inf 1123. 99.9
# 15 638234. 6078001. 197. 319805735. 20 1 1 0 0 1 2 Inf 1123. 99.9
# 16 638234. 6078001. 187. 319805735. 189 1 1 0 0 1 2 Inf 1123. 99.9
# 17 638236. 6078001. 198. 319805735. 2 2 3 1 0 1 2 Inf 1123. 99.9
# 18 638232. 6078001. 200. 319805735. 11 1 1 0 0 1 2 Inf 1123. 99.9
# 19 638232. 6078001. 200. 319805735. 90 1 1 0 0 1 2 Inf 1123. 99.9
# 20 638228. 6078001. 100. 319805735. 1 3 4 0 0 1 2 Inf 1123. 99.9
row is always Inf because lid$x3 is 0. The only part of this that must be grouped is xdif, since it's the only thing that does any groupwise aggregation, the rest can be done ungrouped.
lidar %>%
mutate(
row = (A-lid$x1)/lid$x3,
col = (B-lid$x4)/lid$x6
) %>%
group_by(group) %>%
mutate(cdif = max(C)-min(C)) %>%
ungroup()
Why would one do it this way? With larger datasets or with a lot of groups, it will be more efficient (perhaps perceptibly faster) to do the whole vector at once instead of per-group. The actual calculations should return identical results.

How to find the statistical mode of each ID

Here are the observations of two individuals of my dataset.
data=structure(list(id = c(2L, 2L, 2L, 3L, 3L, 3L), trt = c(1L, 1L,
1L, 1L, 1L, 1L), status = c(0L, 0L, 0L, 2L, 2L, 2L), stage = c(3L,
3L, 3L, 4L, 4L, 4L), spiders = c(1L, 1L, 1L, 0L, 1L, 0L), sex = structure(c(2L,
2L, 2L, 1L, 1L, 1L), .Label = c("m", "f"), class = "factor"),
hepato = c(1L, 1L, 1L, 0L, 1L, 0L), edema = c(0, 0, 0, 0.5,
0, 0.5), ascites = c(0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(NA,
-6L), class = "data.frame")
I want to calculate the the statistical mode for each individual after grouping by id. I used this code below:
library(dplyr)
library(modeest)
data%>%
group_by(id)%>%mutate(edema2=mlv(edema))
And I get an error message when calculating the mode, while this method work well with other statistical parameters such as mean, sd, min, max....
The warnings that you are getting are suggesting two things.
You have not specified what method to choose so default method 'shorth' is used.
It is suggesting that there is a tie in selection of Mode value.
Alternatively, why not use the Mode function from here :
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
To apply by group you can use it with dplyr as :
library(dplyr)
data%>% group_by(id)%>% mutate(edema2= Mode(edema))
# id trt status stage spiders sex hepato edema ascites edema2
# <int> <int> <int> <int> <int> <fct> <int> <dbl> <int> <dbl>
#1 2 1 0 3 1 f 1 0 0 0
#2 2 1 0 3 1 f 1 0 0 0
#3 2 1 0 3 1 f 1 0 0 0
#4 3 1 2 4 0 m 0 0.5 0 0.5
#5 3 1 2 4 1 m 1 0 0 0.5
#6 3 1 2 4 0 m 0 0.5 0 0.5

Grouping a dataframe into matrices based on a variable and transposing

Here is some mock data related to this problem:
structure(list(HHID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L), PERS = c(1L, 2L, 3L, 4L, 5L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 1L, 2L, 3L, 4L, 5L), MARSTAT = c(2L,
2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 5L, 1L, 1L
), SEX = c(1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 1L), VAR1 = c(NA, 1L, 4L, 4L, 4L, NA, 1L, 5L, 4L,
NA, 4L, 4L, NA, 1L, 8L, 4L, 4L), VAR2 = c(NA, NA, 4L, 4L, 4L,
NA, NA, 4L, 5L, NA, NA, 6L, NA, NA, 12L, 4L, 4L), VAR3 = c(NA,
NA, NA, 6L, 6L, NA, NA, NA, 7L, NA, NA, NA, NA, NA, NA, 11L,
11L), VAR4 = c(NA, NA, NA, NA, 6L, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 6L), VAR5 = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), FLAG = c(0L,
0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L
)), .Names = c("HHID", "PERS", "MARSTAT", "SEX", "VAR1", "VAR2",
"VAR3", "VAR4", "VAR5", "FLAG"), row.names = c(NA, 17L), class = "data.frame")
For each household in my data, I want to transpose the values in the lower triangle into the upper triangle so that for each household I essentially have a symmetrical matrix with the diagonal either NA or 0 (for this analysis, 0 and NA are interchangeable). So based on the above example, I would be looking for the following dataset:
structure(list(HHID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L), PERS = c(1L, 2L, 3L, 4L, 5L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 1L, 2L, 3L, 4L, 5L), MARSTAT = c(2L,
2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 5L, 1L, 1L
), SEX = c(1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 1L), VAR1 = c(NA, 1L, 4L, 4L, 4L, NA, 1L, 5L, 4L,
NA, 4L, 4L, NA, 1L, 8L, 4L, 4L), VAR2 = c(1L, NA, 4L, 4L, 4L,
1L, NA, 4L, 5L, 4L, NA, 6L, 1L, NA, 12L, 4L, 4L), VAR3 = c(4L,
4L, NA, 6L, 6L, 5L, 4L, NA, 7L, 4L, 6L, NA, 8L, 12L, NA, 11L,
11L), VAR4 = c(4L, 4L, 6L, NA, 6L, 4L, 5L, 7L, NA, NA, NA, NA,
4L, 4L, 11L, NA, 6L), VAR5 = c(4L, 4L, 6L, 6L, NA, NA, NA, NA,
NA, NA, NA, NA, 4L, 4L, 11L, 6L, NA), FLAG = c(0L, 0L, 0L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 4L, 4L, 11L, 1L, 1L)), .Names = c("HHID",
"PERS", "MARSTAT", "SEX", "VAR1", "VAR2", "VAR3", "VAR4", "VAR5",
"FLAG"), class = "data.frame", row.names = c(NA, -17L))
I have been able to do this for one household, as follows (though it misses the HHID which I would need to distinguish between households):
HH1 <- df %>%
filter(HHID == 1) %>%
select(VAR1, VAR2, VAR3, VAR4, VAR5)
HH1 <- as.matrix(HH1)
HH1[is.na(HH1)] <- 0
T_HH1 <- t(HH1)
T_HH1[is.na(T_HH1)] <- 0
combo <- HH1 + T_HH1
A <- combo
However, how would I go about doing this for multiple households across my dataset, also keeping the "HHID" and "PERS" information so that I can link on any extra info if needed?
Thank you so much in advance!
One approach is:
Split your data frame by HHID into groups
Create a custom function to take VAR columns, make it a square matrix, and transpose
Use rbindlist to reconstruct into rows again using fill to add NA as lengths in the list differ
Replace VAR columns (5 through 9) with new VAR columns
Let me know if this works for you.
f <- function(m) {
m <- m[, 1:nrow(m)]
m[upper.tri(m)] <- t(m)[upper.tri(m)]
m
}
df1[,5:9] <- rbindlist(lapply(split(df1[,5:9], df1$HHID), f), fill = TRUE)
Output
HHID PERS MARSTAT SEX VAR1 VAR2 VAR3 VAR4 VAR5 FLAG
1 1 1 2 1 NA 1 4 4 4 0
2 1 2 2 2 1 NA 4 4 4 0
3 1 3 1 2 4 4 NA 6 6 0
4 1 4 1 1 4 4 6 NA 6 1
5 1 5 1 1 4 4 6 6 NA 0
6 2 1 2 2 NA 1 5 4 NA 0
7 2 2 2 1 1 NA 4 5 NA 0
8 2 3 1 2 5 4 NA 7 NA 1
9 2 4 1 1 4 5 7 NA NA 1
10 3 1 1 2 NA 4 4 NA NA 0
11 3 2 1 2 4 NA 6 NA NA 1
12 3 3 1 1 4 6 NA NA NA 0
13 4 1 2 2 NA 1 8 4 4 0
14 4 2 2 1 1 NA 12 4 4 0
15 4 3 5 2 8 12 NA 11 11 0
16 4 4 1 2 4 4 11 NA 6 1
17 4 5 1 1 4 4 11 6 NA 1
additional solution
library(purrr)
library(tidyverse)
df %>%
mutate_all(~ replace_na(., 0)) %>%
select(HHID, starts_with("VAR")) %>%
group_by(HHID) %>%
nest %>%
mutate(data = map(data, ~ .x + t(.x))) %>%
unnest(data) %>%
bind_cols(select(df, -starts_with("VAR"), -HHID))
You can split the data on the HHID, apply an anonymous function to do the matrix stuff, then unsplit it.
vars <- grep("^VAR", names(df))
df[, vars] <- unsplit(lapply(split(df[, vars], df$HHID), tt), df$HHID)
# HHID PERS MARSTAT SEX VAR1 VAR2 VAR3 VAR4 VAR5 FLAG
# 1 1 1 2 1 0 1 4 4 4 0
# 2 1 2 2 2 1 0 4 4 4 0
# 3 1 3 1 2 4 4 0 6 6 0
# 4 1 4 1 1 4 4 6 0 6 1
# 5 1 5 1 1 4 4 6 6 0 0
# 6 2 1 2 2 0 1 5 4 0 0
# 7 2 2 2 1 1 0 4 5 0 0
# 8 2 3 1 2 5 4 0 7 0 0
# 9 2 4 1 1 4 5 7 0 0 0
# 10 3 1 1 2 0 4 4 0 0 0
# 11 3 2 1 2 4 0 6 0 0 0
# 12 3 3 1 1 4 6 0 0 0 0
# 13 4 1 2 2 0 1 8 4 4 0
# 14 4 2 2 1 1 0 12 4 4 0
# 15 4 3 5 2 8 12 0 11 11 0
# 16 4 4 1 2 4 4 11 0 6 1
# 17 4 5 1 1 4 4 11 6 0 1
Here's the anonymous function:
tt <- function(x) {
x <- x[, 1:nrow(x)] # Make it square
x[upper.tri(x)] <- 0 # replace upper triangle with 0
x + t(x) # add them together
}

Setting incomparables in place with merge

I'm seeing some unexpected behaviour with merge (or at least not entirely intuitive). But perhaps I'm just not understanding how it's supposed to work:
Let's create some dummy data to play with first:
x <- structure(list(A = c(2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L), B = c(2L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L
), C = c(2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L), D = c(2L, 1L, 2L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L), E = c(2L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L), F = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L), G = c(2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L),
H = c(1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 2L, 1L, 1L, 1L), I = c(1L, 1L, 2L, 2L, 2L, 1L,
1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L),
J = c(2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 1L, 2L, 1L), K = c(3, 3, 1, 3, 1, 3, 1, 2,
2, 2, 1, 3, 2, 2, 2, 1, NA, 1, 2, 1)), .Names = c("A", "B",
"C", "D", "E", "F", "G", "H", "I", "J", "K"), row.names = c(NA,
20L), class = "data.frame")
# Generate Listing of All Possible Combinations
y <- list(1:2); y = expand.grid(rep(y,10));
colnames(y) <- LETTERS[1:10]
y <- rbind(y,y,y)
y$K <- rep(1:3,each=1024)
y$mergekey <- sample(1:6,3072,replace=TRUE)
My expectation is that when I merge these two data sets that setting sort=FALSE and all.x=TRUE would provide me with a list of all x in place with mergekey.
Let's try that:
merge(x,y,all.x=TRUE,sort=FALSE)
A B C D E F G H I J K mergekey
1 2 2 2 2 2 1 2 1 1 2 3 5
2 2 2 1 1 1 1 2 2 1 1 3 3
3 2 1 2 2 1 1 2 1 2 2 1 3
4 2 2 1 2 2 1 2 2 2 2 3 2
5 1 1 2 2 2 2 2 1 2 2 1 4
6 2 1 1 1 2 2 2 2 1 2 3 6
7 1 1 1 1 2 2 2 2 1 2 1 5
8 2 1 2 2 1 1 2 2 1 1 2 4
9 2 2 2 1 1 1 2 1 2 2 2 4
10 2 1 2 2 1 1 2 1 1 1 2 2
11 2 1 2 1 1 1 2 1 2 2 1 4
12 2 2 1 2 1 2 2 1 2 1 3 5
13 2 1 2 1 1 1 2 1 2 2 2 3
14 2 1 2 1 1 1 2 1 2 2 2 3
15 2 2 2 1 2 1 2 1 2 2 2 1
16 2 1 1 2 1 1 2 2 2 2 2 1
17 2 1 1 1 1 1 2 1 1 2 1 2
18 1 2 1 1 1 2 2 1 1 1 1 5
19 2 1 2 1 1 1 2 1 1 1 1 4
20 2 2 1 2 1 1 1 2 1 2 NA NA
Now it seems that "most of x is unsorted" but incomparables are pushed to the end, rather than maintaining their order.
So, my question is: How do I get the incomparables to stay in place?
PS: Does it not seem a little unintuitive to push incomparables to the end if the merge has been told not to sort? I don't find this congruent with this behaviour either
The join function in the plyr package solves this problem intuitively without additional arguements.
library(plyr)
join(x,y)
Joining by: A, B, C, D, E, F, G, H, I, J, K
A B C D E F G H I J K mergekey
1 2 2 2 2 2 1 2 1 1 2 3 4
2 2 2 1 1 1 1 2 2 1 1 3 3
3 2 1 2 2 1 1 2 1 2 2 1 5
4 2 2 1 2 2 1 2 2 2 2 3 3
5 1 1 2 2 2 2 2 1 2 2 1 6
6 2 1 1 1 2 2 2 2 1 2 3 6
7 1 1 1 1 2 2 2 2 1 2 1 4
8 2 1 2 2 1 1 2 2 1 1 2 2
9 2 2 2 1 1 1 2 1 2 2 2 4
10 2 1 2 2 1 1 2 1 1 1 2 6
11 2 1 2 1 1 1 2 1 2 2 1 1
12 2 2 1 2 1 2 2 1 2 1 3 3
13 2 1 2 1 1 1 2 1 2 2 2 2
14 2 2 2 1 2 1 2 1 2 2 2 6
15 2 1 1 2 1 1 2 2 2 2 2 2
16 2 1 1 1 1 1 2 1 1 2 1 3
17 2 2 1 2 1 1 1 2 1 2 NA NA
18 1 2 1 1 1 2 2 1 1 1 1 1
19 2 1 2 1 1 1 2 1 2 2 2 2
20 2 1 2 1 1 1 2 1 1 1 1 1

Resources