Seeing if all values in one dataframe row exist in another dataframe - r

I have a dataframe as follows:
df1
ColA ColB ColC ColD
10 A B L
11 N Q NA
12 P J L
43 M T NA
89 O J T
df2
ATTR Att R1 R2 R3 R4
1 45 A B NA NA
2 40 C D NA NA
3 33 T J O NA
4 65 L NA NA NA
5 20 P L J NA
6 23 Q NA NA NA
7 38 Q L NA NA
How do I match up df2 with df1 so that if ALL the values in each df2 row (disregarding the order) show up in the df1 rows, then it will populate. So it is checking if ALL not just one value from each df2 row matches up with each df1 row. The final result in this case should be this:
ColA ColB ColC ColD ATTR Att R1 R2 R3 R4
10 A B L 1 45 A B NA NA
10 A B L 4 65 L NA NA NA
11 N Q NA 6 23 Q NA NA NA
12 P J L 4 65 L NA NA NA
12 P J L 5 20 P L J NA
89 O J T 3 33 T J O NA
Thanks

Here is a possible solution using base R.
Make sure everything is a character before continuing, i.e.
df[-1] <- lapply(df[-1], as.character)
df1[-c(1:2)] <- lapply(df1[-c(1:2)], as.character)
First we create two lists which contain vectors of the rowwise elements of each data frame. We then create a matrix with the length of elements from l2 are found in l1, If the length is 0 then it means they match. i.e,
l1 <- lapply(split(df[-1], seq(nrow(df))), function(i) i[!is.na(i)])
l2 <- lapply(split(df1[-c(1:2)], seq(nrow(df1))), function(i) i[!is.na(i)])
m1 <- sapply(l1, function(i) sapply(l2, function(j) length(setdiff(j, i))))
m1
# 1 2 3 4 5
#1 0 2 2 2 2
#2 2 2 2 2 2
#3 3 3 2 2 0
#4 0 1 0 1 1
#5 2 3 0 3 2
#6 1 0 1 1 1
#7 1 1 1 2 2
We then use that matrix to create a couple of coloumns in our original df. The first column rpt will indicate how many times each row has length 0 and use that as a number of repeats for each row. We also use it to filter out all the 0 lengths (i.e. the rows that do not have a match with df1). After expanding the data frame we create another variable; ATTR (same name as ATTR in df1) in order to use it for a merge. i.e.
df$rpt <- colSums(m1 == 0)
df <- df[df$rpt != 0,]
df <- df[rep(row.names(df), df$rpt),]
df$ATTR <- which(m1 == 0, arr.ind = TRUE)[,1]
df
# ColA ColB ColC ColD rpt ATTR
#1 10 A B L 2 1
#1.1 10 A B L 2 4
#2 11 N Q <NA> 1 6
#3 12 P J L 2 4
#3.1 12 P J L 2 5
#5 89 O J T 1 3
We then merge and order the two data frames,
final_df <- merge(df, df1, by = 'ATTR')
final_df[order(final_df$ColA),]
# ATTR ColA ColB ColC ColD rpt Att R1 R2 R3 R4
#1 1 10 A B L 2 45 A B <NA> <NA>
#3 4 10 A B L 2 65 L <NA> <NA> <NA>
#6 6 11 N Q <NA> 1 23 Q <NA> <NA> <NA>
#4 4 12 P J L 2 65 L <NA> <NA> <NA>
#5 5 12 P J L 2 20 P L J <NA>
#2 3 89 O J T 1 33 T J O <NA>
DATA
dput(df)
structure(list(ColA = c(10L, 11L, 12L, 43L, 89L), ColB = c("A",
"N", "P", "M", "O"), ColC = c("B", "Q", "J", "T", "J"), ColD = c("L",
NA, "L", NA, "T")), .Names = c("ColA", "ColB", "ColC", "ColD"
), row.names = c(NA, -5L), class = "data.frame")
dput(df1)
structure(list(ATTR = 1:7, Att = c(45L, 40L, 33L, 65L, 20L, 23L,
38L), R1 = c("A", "C", "T", "L", "P", "Q", "Q"), R2 = c("B",
"D", "J", NA, "L", NA, "L"), R3 = c(NA, NA, "O", NA, "J", NA,
NA), R4 = c(NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_)), .Names = c("ATTR",
"Att", "R1", "R2", "R3", "R4"), row.names = c(NA, -7L), class = "data.frame")

Related

Reshaping wide to long with every n columns

Suppose I have a dataframe:
dw <- read.table(header=T, text='
ID q1 q2 q3 q4 q5 ...q10
A 10 6 50 10 bA
B 12 5 70 11 bB
C 20 7 20 8 bC
D 22 8 22 9 bD
')
I would like to move every 2 columns after 'ID' to new rows so it looks like:
ID q1 q2
A 10 6
B 12 5
C 20 7
D 22 8
A 50 10
B 70 11
C 20 8
D 22 9
....
pivot_longer seems to move every single column instead of multiple columns?
It seems that you are not concerned with the column names (other than ID), and that they are all the same class. For this, we can "pivot" manually, without the safeguards or power of pivot_lower perhaps, but without the requirements as well.
The first step is to make sure that class won't be an issue; because you have some strings in there, we need to convert all to character:
dw[-1] <- lapply(dw[-1], as.character)
After that, we can manually extract every two (non-ID) columns and combine with ID:
cols <- seq_along(dw)[-1]
list_of_frames <- lapply(split(cols, cols %/% 2), function(ind) setNames(dw[,c(1, ind)], c("ID", "q1", "q2")))
list_of_frames
# $`1`
# ID q1 q2
# 1 A 10 6
# 2 B 12 5
# 3 C 20 7
# 4 D 22 8
# $`2`
# ID q1 q2
# 1 A 50 10
# 2 B 70 11
# 3 C 20 8
# 4 D 22 9
# $`3`
# ID q1 q2
# 1 A bA zA
# 2 B bB zB
# 3 C bC zC
# 4 D bD zD
This can be easily combined with several methods, choose one of:
data.table::rbindlist(list_of_frames)
dplyr::bind_rows(list_of_frames)
do.call(rbind, list_of_frames)
# ID q1 q2
# 1 A 10 6
# 2 B 12 5
# 3 C 20 7
# 4 D 22 8
# 5 A 50 10
# 6 B 70 11
# 7 C 20 8
# 8 D 22 9
# 9 A bA zA
# 10 B bB zB
# 11 C bC zC
# 12 D bD zD
Data
dw <- structure(list(ID = c("A", "B", "C", "D"), q1 = c("10", "12", "20", "22"), q2 = c("6", "5", "7", "8"), q3 = c("50", "70", "20", "22"), q4 = c("10", "11", "8", "9"), q5 = c("bA", "bB", "bC", "bD"), q6 = c("zA", "zB", "zC", "zD")), row.names = c(NA, -4L), class = "data.frame")
Another option:
data.frame(ID = dw$ID,
q1 = unlist(dw[,seq(2, ncol(dw), 2)], use.names = FALSE),
q2 = unlist(dw[,seq(3, ncol(dw), 2)], use.names = FALSE))
With data:
dw <- structure(list(ID = c("A", "B", "C", "D"),
q1 = c(10L, 12L, 20L, 22L),
q2 = c(6L, 5L, 7L, 8L),
q3 = c(50L, 70L, 20L, 22L),
q4 = c(10L, 11L, 8L, 9L),
q5 = c("bA", "bB", "bC", "bD"),
q6 = c("cc", "dd", "ee", "ff"))
, class = "data.frame", row.names = c(NA, -4L))
data.frame(ID = dw$ID,
q1 = unlist(dw[,seq(2, ncol(dw), 2)], use.names = FALSE),
q2 = unlist(dw[,seq(3, ncol(dw), 2)], use.names = FALSE))
#> ID q1 q2
#> 1 A 10 6
#> 2 B 12 5
#> 3 C 20 7
#> 4 D 22 8
#> 5 A 50 10
#> 6 B 70 11
#> 7 C 20 8
#> 8 D 22 9
#> 9 A bA cc
#> 10 B bB dd
#> 11 C bC ee
#> 12 D bD ff
Or more generally:
n <- 3L # operate on every 3 columns
data.frame(
setNames(
c(
list(dw[,1]),
lapply(
2:(n + 1L),
function(i) unlist(dw[,seq(i, ncol(dw), n)], TRUE, FALSE)
)
),
names(dw)[1:(n + 1L)]
)
)
#> ID q1 q2 q3
#> 1 A 10 6 50
#> 2 B 12 5 70
#> 3 C 20 7 20
#> 4 D 22 8 22
#> 5 A 10 bA cc
#> 6 B 11 bB dd
#> 7 C 8 bC ee
#> 8 D 9 bD ff
The melt(...) method for data.table allows for melting groups of columns. Using dw from #r2evans answer:
library(data.table)
setDT(dw)
result <- melt(dw, measure.vars = list(seq(2, ncol(dw), 2), seq(3, ncol(dw), 2)))
result[, variable:=NULL]
result
## ID value1 value2
## 1: A 10 6
## 2: B 12 5
## 3: C 20 7
## 4: D 22 8
## 5: A 50 10
## 6: B 70 11
## 7: C 20 8
## 8: D 22 9
## 9: A bA zA
## 10: B bB zB
## 11: C bC zC
## 12: D bD zD
melt(...) introduces a column variable which keeps track of the location of the original columns in the wide dataset. You don't seem to care about that so it's removed. If there are indeed different classes (integer, character) melt(...) will take care of that with a warning.

merge multiple columns in one table?

I have a table with several columns, I would like to make a column by combining 'R1,R2 and R3' columns in a table.
DF:
ID R1 T1 R2 T2 R3 T3
rs1 A 1 NA . NA 0
rs21 NA 0 C 1 C 1
rs32 A 1 A 1 A 0
rs25 NA 2 NA 0 A 0
Desired output:
ID R1 T1 R2 T2 R3 T3 New_R
rs1 A 1 NA . NA 0 A
rs21 NA 0 C 1 C 1 C
rs32 A 1 A 1 A 0 A
rs25 NA 2 NA 0 A 0 A
We can use tidyverse
library(tidyverse)
DF %>%
mutate(New_R = pmap_chr(select(., starts_with("R")), ~c(...) %>%
na.omit %>%
unique %>%
str_c(collape="")))
#. ID R1 T1 R2 T2 R3 T3 New_R
#1 rs1 A 1 <NA> . <NA> 0 A
#2 rs21 <NA> 0 C 1 C 1 C
#3 rs32 A 1 A 1 A 0 A
#4 rs25 <NA> 2 <NA> 0 A 0 A
If there is only one non-NA element per row, we can use coalecse
DF %>%
mutate(New_R = coalesce(!!! select(., starts_with("R"))))
Or in base R
DF$New_R <- do.call(pmin, c(DF[grep("^R\\d+", names(DF))], na.rm = TRUE))
data
DF <- structure(list(ID = c("rs1", "rs21", "rs32", "rs25"), R1 = c("A",
NA, "A", NA), T1 = c(1L, 0L, 1L, 2L), R2 = c(NA, "C", "A", NA
), T2 = c(".", "1", "1", "0"), R3 = c(NA, "C", "A", "A"), T3 = c(0L,
1L, 0L, 0L)), class = "data.frame", row.names = c(NA, -4L))
you can use the ifelse function in a nested way:
DF$New_R <- ifelse(!is.na(DF$R1), DF$R1,
ifelse(!is.na(DF$R2), DF$R2,
ifelse(!is.na(DF$R3), DF$R3, NA)))
ifelse takes three arguments, a condition, what to do if the condition is fulfilled, and what to do if the condition is not fulfilled. It can be applied to data frame column treating each raw separately. In my example it will pick the first non NA value found.
We can use apply row-wise, remove NA values and keeping only unique values.
cols <- paste0("R", 1:3)
df$New_R <- apply(df[cols], 1, function(x)
paste0(unique(na.omit(x)), collapse = ""))
df
# ID R1 T1 R2 T2 R3 T3 New_R
#1 rs1 A 1 <NA> . <NA> 0 A
#2 rs21 <NA> 0 C 1 C 1 C
#3 rs32 A 1 A 1 A 0 A
#4 rs25 <NA> 2 <NA> 0 A 0 A

Remove NA in front of one specific string but leave in front of another specific string, by group

I have this data frame:
df <- data.frame(
id = rep(1:4, each = 4),
status = c(
NA, "a", "c", "a",
NA, "b", "c", "c",
NA, NA, "a", "c",
NA, NA, "b", "b"),
stringsAsFactors = FALSE)
For each group (id), I aim to remove the rows with one or multiple leading NA in front of an "a" (in the column "status") but not in front of a "b".
The final data frame should look like this:
structure(list(
id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 4L, 4L, 4L, 4L),
status = c("a", "c", "a", NA, "b", "c", "c", "a", "c", NA, NA, "b", "b")),
.Names = c("id", "status"), row.names = c(NA, -13L), class = "data.frame")
How do I do that?
Edit: alternatively, how would I do it to preserve other variables in the data frame such as the variable otherVar in the following example:
df2 <- data.frame(
id = rep(1:4, each = 4),
status = c(
NA, "a", "c", "a",
NA, "b", "c", "c",
NA, NA, "a", "c",
NA, NA, "b", "b"),
otherVar = letters[1:16],
stringsAsFactors = FALSE)
We can group by 'id', summarise the 'status' by pasteing the elements together, then use gsub to remove the NA before the 'a' and convert it to 'long' format with separate_rows
library(dplyr)
library(tidyr)
df %>%
group_by(id) %>%
summarise(status = gsub("(NA, ){1,}(?=a)", "", toString(status),
perl = TRUE)) %>%
separate_rows(status, convert = TRUE)
# A tibble: 13 x 2
# id status
# <int> <chr>
# 1 1 a
# 2 1 c
# 3 1 a
# 4 2 NA
# 5 2 b
# 6 2 c
# 7 2 c
# 8 3 a
# 9 3 c
#10 4 NA
#11 4 NA
#12 4 b
#13 4 b
Or using data.table with the same methodology
library(data.table)
out1 <- setDT(df)[, strsplit(gsub("(NA, ){1,}(?=a)", "",
toString(status), perl = TRUE), ", "), id]
setnames(out1, 'V1', "status")[]
# id status
# 1: 1 a
# 2: 1 c
# 3: 1 a
# 4: 2 NA
# 5: 2 b
# 6: 2 c
# 7: 2 c
# 8: 3 a
# 9: 3 c
#10: 4 NA
#11: 4 NA
#12: 4 b
#13: 4 b
Update
For the updated dataset 'df2'
i1 <- setDT(df2)[, .I[seq(which(c(diff((status %in% "a") +
rleid(is.na(status))) > 1), FALSE))] , id]$V1
df2[-i1]
# id status otherVar
# 1: 1 a b
# 2: 1 c c
# 3: 1 a d
# 4: 2 NA e
# 5: 2 b f
# 6: 2 c g
# 7: 2 c h
# 8: 3 a k
# 9: 3 c l
#10: 4 NA m
#11: 4 NA n
#12: 4 b o
#13: 4 b p
From zoo with na.locf and is.na, notice it assuming you data is ordered.
df[!(na.locf(df$status,fromLast = T)=='a'&is.na(df$status)),]
id status
2 1 a
3 1 c
4 1 a
5 2 <NA>
6 2 b
7 2 c
8 2 c
11 3 a
12 3 c
13 4 <NA>
14 4 <NA>
15 4 b
16 4 b
Here's a dplyr solution and a not as pretty base translation :
dplyr
library(dplyr)
df %>% group_by(id) %>%
filter(status[!is.na(status)][1]!="a" | !is.na(status))
# # A tibble: 13 x 2
# # Groups: id [4]
# id status
# <int> <chr>
# 1 1 a
# 2 1 c
# 3 1 a
# 4 2 <NA>
# 5 2 b
# 6 2 c
# 7 2 c
# 8 3 a
# 9 3 c
# 10 4 <NA>
# 11 4 <NA>
# 12 4 b
# 13 4 b
base
do.call(rbind,
lapply(split(df,df$id),
function(x) x[x$status[!is.na(x$status)][1]!="a" | !is.na(x$status),]))
# id status
# 1.2 1 a
# 1.3 1 c
# 1.4 1 a
# 2.5 2 <NA>
# 2.6 2 b
# 2.7 2 c
# 2.8 2 c
# 3.11 3 a
# 3.12 3 c
# 4.13 4 <NA>
# 4.14 4 <NA>
# 4.15 4 b
# 4.16 4 b
note
Will fail if not all NAs are leading because will remove all NAs from groups starting with "a" as a first non NA value.

R split each row of a dataframe into two rows

I would like to splite each row of a data frame(numberic) into two rows. For example, part of the original data frame like this (nrow(original datafram) > 2800000):
ID X Y Z value_1 value_2
1 3 2 6 22 54
6 11 5 9 52 71
3 7 2 5 2 34
5 10 7 1 23 47
And after spliting each row, we can get:
ID X Y Z
1 3 2 6
22 54 NA NA
6 11 5 9
52 71 NA NA
3 7 2 5
2 34 NA NA
5 10 7 1
23 47 NA NA
the "value_1" and "value_2" columns are split and each element is set to a new row. For example, value_1 = 22 and value_2 = 54 are set to a new row.
Here is one option with data.table. We convert the 'data.frame' to 'data.table' by creating a column of rownames (setDT(df1, keep.rownames = TRUE)). Subset the columns 1:5 and 1, 6, 7 in a list, rbind the list element with fill = TRUE option to return NA for corresponding columns that are not found in one of the datasets, order by the row number ('rn') and assign (:=) the row number column to 'NULL'.
library(data.table)
setDT(df1, keep.rownames = TRUE)[]
rbindlist(list(df1[, 1:5, with = FALSE], setnames(df1[, c(1, 6:7),
with = FALSE], 2:3, c("ID", "X"))), fill = TRUE)[order(rn)][, rn:= NULL][]
# ID X Y Z
#1: 1 3 2 6
#2: 22 54 NA NA
#3: 6 11 5 9
#4: 52 71 NA NA
#5: 3 7 2 5
#6: 2 34 NA NA
#7: 5 10 7 1
#8: 23 47 NA NA
A hadleyverse corresponding to the above logic would be
library(dplyr)
tibble::rownames_to_column(df1[1:4]) %>%
bind_rows(., setNames(tibble::rownames_to_column(df1[5:6]),
c("rowname", "ID", "X"))) %>%
arrange(rowname) %>%
select(-rowname)
# ID X Y Z
#1 1 3 2 6
#2 22 54 NA NA
#3 6 11 5 9
#4 52 71 NA NA
#5 3 7 2 5
#6 2 34 NA NA
#7 5 10 7 1
#8 23 47 NA NA
data
df1 <- structure(list(ID = c(1L, 6L, 3L, 5L), X = c(3L, 11L, 7L, 10L
), Y = c(2L, 5L, 2L, 7L), Z = c(6L, 9L, 5L, 1L), value_1 = c(22L,
52L, 2L, 23L), value_2 = c(54L, 71L, 34L, 47L)), .Names = c("ID",
"X", "Y", "Z", "value_1", "value_2"), class = "data.frame",
row.names = c(NA, -4L))
Here's a (very slow) pure R solution using no extra packages:
# Replicate your matrix
input_df <- data.frame(ID = rnorm(10000),
X = rnorm(10000),
Y = rnorm(10000),
Z = rnorm(10000),
value_1 = rnorm(10000),
value_2 = rnorm(10000))
# Preallocate memory to a data frame
output_df <- data.frame(
matrix(
nrow = nrow(input_df)*2,
ncol = ncol(input_df)-2))
# Loop through each row in turn.
# Put the first four elements into the current
# row, and the next two into the current+1 row
# with two NAs attached.
for(i in seq(1, nrow(output_df), 2)){
output_df[i,] <- input_df[i, c(1:4)]
output_df[i+1,] <- c(input_df[i, c(5:6)],NA,NA)
}
colnames(output_df) <- c("ID", "X", "Y", "Z")
Which results in
> head(output_df)
X1 X2 X3 X4
1 0.5529417 -0.93859275 2.0900276 -2.4023800
2 0.9751090 0.13357075 NA NA
3 0.6753835 0.07018647 0.8529300 -0.9844643
4 1.6405939 0.96133195 NA NA
5 0.3378821 -0.44612782 -0.8176745 0.2759752
6 -0.8910678 -0.37928353 NA NA
This should work
data <- read.table(text= "ID X Y Z value_1 value_2
1 3 2 6 22 54
6 11 5 9 52 71
3 7 2 5 2 34
5 10 7 1 23 47", header=T)
data1 <- data[,1:4]
data2 <- setdiff(data,data1)
names(data2) <- names(data1)[1:ncol(data2)]
combined <- plyr::rbind.fill(data1,data2)
n <- nrow(data1)
combined[kronecker(1:n, c(0, n), "+"),]
Though why you would need to do this beats me.

Replace value in data.frame with value in next column

I have dataframe with two columns:
names duration
1 J 97
2 G NA
3 H 53
4 A 23
5 E NA
6 D NA
7 C 73
8 F NA
9 B 37
10 I 67
What I want to do is replace all NA values in duration column with value from names column from the same row. How can I achive that?
Data
zz <- "names duration
1 J 97
2 G NA
3 H 53
4 A 23
5 E NA
6 D NA
7 C 73
8 F NA
9 B 37
10 I 67"
df <- read.table(text = zz, header = TRUE)
Solution with dplyr
library(dplyr)
df_new <- df %>%
mutate(duration = ifelse(is.na(duration), as.character(names), duration))
Output
df_new
# names duration
# 1 J 97
# 2 G G
# 3 H 53
# 4 A 23
# 5 E E
# 6 D D
# 7 C 73
# 8 F F
# 9 B 37
# 10 I 67
We can use is.na to create a logical index and then subset both the 'names' based on the 'i1' to replace the 'duration' on the same row.
i1 <- is.na(df$duration)
df$duration[i1] <- df$names[i1]
df
# names duration
#1 J 97
#2 G G
#3 H 53
#4 A 23
#5 E E
#6 D D
#7 C 73
#8 F F
#9 B 37
#10 I 67
NOTE: This should change the class of the 'duration' to character from numeric
Or this can be done with a faster approach with data.table. Convert the 'data.frame' to 'data.table' (setDT(df)), change the class of 'duration' to character, then by specifying the condition in 'i' (is.na(duration)), we assign (:=) the values in 'name' that correspond to the 'i' condition to 'duration'. As the assignment happens in place, it will be very efficient.
library(data.table)
setDT(df)[, duration:= as.character(duration)][is.na(duration), duration:= names]
data
df <- structure(list(names = c("J", "G", "H", "A", "E", "D", "C", "F",
"B", "I"), duration = c(97L, NA, 53L, 23L, NA, NA, 73L, NA, 37L,
67L)), .Names = c("names", "duration"), row.names = c("1", "2",
"3", "4", "5", "6", "7", "8", "9", "10"), class = "data.frame")

Resources