Compare and identify the missing rows in R

I would like to compare two data frames row by row based on the serial and day variables and create a new column called compare that highlights the missing rows. How can this be done in R? I tried the inner_join function without success.
Sample structure df1 and df2
Desired output:
Sample data
df1<-structure(list(serial = c(1, 2, 3, 4, 5), day = c(1, 0, 1, 0,
0)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA,
-5L), spec = structure(list(cols = list(serial = structure(list(), class = c("collector_double",
"collector")), day = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
df2<-structure(list(serial = c(1, 2, 3, 4, 5, 5, 7), day = c(1, 0,
1, 0, 0, 1, 1)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -7L), spec = structure(list(cols = list(
serial = structure(list(), class = c("collector_double",
"collector")), day = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))

We can use the tidyverse:
library(dplyr)
df2 %>%
  mutate(compare = TRUE) %>%
  left_join(df1 %>%
              mutate(compare1 = TRUE), by = c('serial', 'day')) %>%
  transmute(serial, day, compare = (!is.na(compare1)))
Output:
# A tibble: 7 x 3
  serial   day compare
   <dbl> <dbl> <lgl>
1      1     1 TRUE
2      2     0 TRUE
3      3     1 TRUE
4      4     0 TRUE
5      5     0 TRUE
6      5     1 FALSE
7      7     1 FALSE
Or with a faster and more efficient data.table approach:
library(data.table)
setDT(df2)[, compare := FALSE][setDT(df1), compare := TRUE, on = .(serial, day)]
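Since setDT() converts the data frames to data.tables in place and the second pair of brackets performs an update join, df2 itself now carries the flag; a quick check (this print call is an addition, and the flags should match the dplyr output above):
# df2 was updated by reference, so printing it shows the new compare column
df2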

One way would be to create a unique key combining the two columns and use %in% to find if the key is present in another dataset.
A base R option -
df2$compare <- do.call(paste, df2) %in% do.call(paste, df1)
df2
# A tibble: 7 x 3
#   serial   day compare
#    <dbl> <dbl> <lgl>
#1       1     1 TRUE
#2       2     0 TRUE
#3       3     1 TRUE
#4       4     0 TRUE
#5       5     0 TRUE
#6       5     1 FALSE
#7       7     1 FALSE
If there are more columns in your data apart from serial and day, use the code below.
cols <- c('serial', 'day')
df2$compare <- do.call(paste, df2[cols]) %in% do.call(paste, df1[cols])

A base R option
transform(
  merge(cbind(df1, compare = TRUE), df2, all = TRUE),
  compare = !is.na(compare)
)
gives
  serial day compare
1      1   1    TRUE
2      2   0    TRUE
3      3   1    TRUE
4      4   0    TRUE
5      5   0    TRUE
6      5   1   FALSE
7      7   1   FALSE
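If all you need is the rows of df2 that have no match in df1 (rather than a flag column), a short sketch using dplyr's anti_join() on the same keys would be:
library(dplyr)
# rows of df2 with no matching (serial, day) combination in df1
anti_join(df2, df1, by = c('serial', 'day'))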

Related

How can I LAG the previous value that meets a condition in another column (R)?

I would like to return the previous value for each row, but not simply the lag with n = 1; the previous value must meet a condition in another column, in this case Presence = 1.
Table with expected result
Thanks!
You could use dplyr and tidyr:
library(dplyr)
library(tidyr)
data %>%
  # each presence == 1 row starts a new group (cumsum only increments on 1s)
  group_by(person, indicator = cumsum(presence)) %>%
  # keep result only on the presence == 1 rows, NA elsewhere
  mutate(expected_lag = ifelse(presence == 0, NA, presence * result)) %>%
  # carry that value down through the following presence == 0 rows
  fill(expected_lag, .direction = "down") %>%
  group_by(person) %>%
  # shift by one row so every row sees the previous qualifying result
  mutate(expected_lag = lag(expected_lag)) %>%
  select(-indicator) %>%
  ungroup()
which returns
# A tibble: 9 x 4
  person presence result expected_lag
  <chr>     <dbl>  <dbl>        <dbl>
1 Ane           1      5           NA
2 Ane           0      6            5
3 Ane           0      4            5
4 Ane           1      8            5
5 Ane           1      7            8
6 John          0      9           NA
7 John          1      2           NA
8 John          0      4            2
9 John          1      3            2
Data
For simplification I removed the date column.
structure(list(person = c("Ane", "Ane", "Ane", "Ane", "Ane",
"John", "John", "John", "John"), presence = c(1, 0, 0, 1, 1,
0, 1, 0, 1), result = c(5, 6, 4, 8, 7, 9, 2, 4, 3)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -9L), spec = structure(list(
cols = list(person = structure(list(), class = c("collector_character",
"collector")), presence = structure(list(), class = c("collector_double",
"collector")), result = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))

Merge 2 data frames with respect to columns

I have 2 dataframes as shown. Can we merge with rep
df1
a b c
X a 2
X b 4
X c 1
Y a 2
Y b 1
df2
a1 c1
X 12
Y 10
Expected output (X and Y are top-level values; under X we have a, b and c, and under Y we have a and b, so we need to place them above these values). Also, in the other data frame df2, we have values for both X and Y that need to be populated into data frame df1. Is this possible to achieve?
a b c
X 12
X a 2
X b 4
X c 1
Y 10
Y a 2
Y b 1
You could use dplyr:
library(dplyr)
df2 %>%
  # turn the df2 totals into rows with the same columns as df1, flagged with prio = 1
  transmute(a = a1, b = a1, c = c1, prio = 1) %>%
  bind_rows(df1 %>% mutate(prio = 2)) %>%
  # sort so each total row (prio = 1) comes directly before its detail rows
  arrange(a, prio, b) %>%
  # blank out column a on the total rows
  mutate(a = ifelse(prio == 1, NA_character_, a)) %>%
  select(-prio)
returns
# A tibble: 7 x 3
  a     b         c
  <chr> <chr> <dbl>
1 NA    X        12
2 X     a         2
3 X     b         4
4 X     c         1
5 NA    Y        10
6 Y     a         2
7 Y     b         1
If you prefer an empty string over NA, just replace NA_character_ with "".
Data
df1 <- structure(list(a = c("X", "X", "X", "Y", "Y"), b = c("a", "b",
"c", "a", "b"), c = c(2, 4, 1, 2, 1)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -5L), spec = structure(list(
cols = list(a = structure(list(), class = c("collector_character",
"collector")), b = structure(list(), class = c("collector_character",
"collector")), c = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 2L), class = "col_spec"))
df2 <- structure(list(a1 = c("X", "Y"), c1 = c(12, 10)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -2L), spec = structure(list(
cols = list(a1 = structure(list(), class = c("collector_character",
"collector")), c1 = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))

Convert all columns ending in 'ID' to character in tidyverse

I have numerous data frames with many columns where the name of the column ends in "ID". What's the simplest way to change the type of every column ending in "ID"? Ideally I'd pass the imported data frame to a function which would return the same data frame but with the column types changed. I definitely can't hardcode the column names, as I will not know in advance what the columns are.
Here's some sample data:
test_data <- structure(list(ContactID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
), SystemID = c(3, 1, 5, 4, 3, 5, 35, 1, 55, 52, 9), Value1 = c("A",
"B", "C", "D", "E", "F", "E", "G", "D", "S", "C"), Value2 = c("1/01/2020",
"2/01/2020", "3/01/2020", "4/01/2020", "5/01/2020", "6/01/2020",
"7/01/2020", "8/01/2020", "9/01/2020", "10/01/2020", "11/01/2020"
), OtherID = c(10004, 10009, 10002, 10007, 10099, 10010, 10002,
10004, 10002, 10007, 10099)), class = c("spec_tbl_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -11L), spec = structure(list(
cols = list(ContactID = structure(list(), class = c("collector_double",
"collector")), SystemID = structure(list(), class = c("collector_double",
"collector")), Value1 = structure(list(), class = c("collector_character",
"collector")), Value2 = structure(list(), class = c("collector_character",
"collector")), OtherID = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
Columns ContactID, SystemID and OtherID have been imported from a CSV file (using read_csv from readr) and so have been designated numeric. I want a function to which I can pass this (or any other data frame) to change any columns ending in ID to character.
I've tried this but it seems very clumsy. Looking for a neater solution.
change_ID_cols <- function(x) {
  id_cols <- grep("ID$", colnames(x))
  for (i in id_cols) {
    for (j in 1:nrow(x)) {
      x[j, i] <- as.character(x[j, i])
    }
  }
  x
}
Does this work:
library(dplyr)
test_data %>% mutate(across(ends_with('ID'), as.character))
# A tibble: 11 x 5
   ContactID SystemID Value1 Value2     OtherID
   <chr>     <chr>    <chr>  <chr>      <chr>
 1 1         3        A      1/01/2020  10004
 2 2         1        B      2/01/2020  10009
 3 3         5        C      3/01/2020  10002
 4 4         4        D      4/01/2020  10007
 5 5         3        E      5/01/2020  10099
 6 6         5        F      6/01/2020  10010
 7 7         35       E      7/01/2020  10002
 8 8         1        G      8/01/2020  10004
 9 9         55       D      9/01/2020  10002
10 10        52       S      10/01/2020 10007
11 11        9        C      11/01/2020 10099
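To wrap this in a reusable function, as the question asks, a minimal sketch could be (the name chr_id_cols is just illustrative):
library(dplyr)
# convert every column whose name ends in "ID" to character
chr_id_cols <- function(df) {
  df %>% mutate(across(ends_with("ID"), as.character))
}
test_data <- chr_id_cols(test_data)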
You don't have to change each value individually to character. You can convert the whole column to character at once; to do this for multiple columns, use lapply.
change_ID_cols <- function(x) {
  id_cols <- grep("ID$", colnames(x))
  x[id_cols] <- lapply(x[id_cols], as.character)
  x
}
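Usage would then be, for example:
test_data <- change_ID_cols(test_data)
str(test_data)  # the *ID columns should now be character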
An option with data.table would be
library(data.table)
nm <- grep('ID$', names(test_data), value = TRUE)
setDT(test_data)[, (nm) := lapply(.SD, as.character), .SDcols = nm]

Fill multiple columns in an R dataframe [duplicate]

I have a dataframe called flu that is a count of cases (n) by group per week.
flu <- structure(list(isoweek = c(1, 1, 2, 2, 3, 3, 4, 5, 5), group = c("fluA",
"fluB", "fluA", "fluB", "fluA", "fluB", "fluA", "fluA", "fluB"
), n = c(5, 6, 3, 5, 12, 14, 6, 23, 25)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -9L), spec = structure(list(
cols = list(isoweek = structure(list(), class = c("collector_double",
"collector")), group = structure(list(), class = c("collector_character",
"collector")), n = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
In the data set, weeks with zero cases are simply not reported, so there are no NA values to work with.
I have identified a fix for this to fill in missing weeks with zeros:
flu %>% complete(isoweek, nesting(group), fill = list(n = 0))
My problem is that this only works for the weeks that appear in the data. For example, if no cases are reported at weeks 6, 7, 8 etc., there is no data for those weeks at all.
How can I extend this fill process so that the data frame is padded with zeros for isoweeks 6 to 10 (for example), with a corresponding fluA and fluB row (n = 0) for each isoweek/group pair?
You can expand multiple columns in complete. Let's say you need data up to week 8; you can do:
tidyr::complete(flu, isoweek = 1:8, group, fill = list(n = 0))
# A tibble: 16 x 3
#   isoweek group     n
#     <dbl> <chr> <dbl>
# 1       1 fluA      5
# 2       1 fluB      6
# 3       2 fluA      3
# 4       2 fluB      5
# 5       3 fluA     12
# 6       3 fluB     14
# 7       4 fluA      6
# 8       4 fluB      0
# 9       5 fluA     23
#10       5 fluB     25
#11       6 fluA      0
#12       6 fluB      0
#13       7 fluA      0
#14       7 fluB      0
#15       8 fluA      0
#16       8 fluB      0
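If the target week isn't known in advance, the same call works with a variable upper bound; a minimal sketch (last_week is a made-up variable standing in for whatever week you want to pad out to):
last_week <- 10  # hypothetical cut-off week
tidyr::complete(flu, isoweek = seq_len(last_week), group, fill = list(n = 0))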

Joining two dataframes to remove NaN values in the first dataframe

I would like to merge two dataframe columns.
I have df1, which has a specific column (df$col1). This column has rows 1-100; certain rows have NA values (let's say rows 10, 15, 20, 50, 69).
Dataframe 2 has rows 10, 15, 20, 50, 69.
Is it possible to merge DF2 into df$col such that only the NA values in df$col are filled by DF2, depending on the index number for each dataset?
I tried this but instead got a dataframe that did not look anything like what I want
merge(brfss2$pa1min_,df,by.x=1,by.y=1,all.x=TRUE,all.y=TRUE)
Here are the two dataframes
Dataframe1:
1 NA
2 110
3 NA
4 35
5 NA
6 120
7 280
8 30
9 240
10 260
11 322
12 NA
Dataframe 2:
1 2127.6
3 1403.0
5 198.0
12 112.8
A different method - I imported your data and gave column names:
df <- structure(list(col1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
), col2 = c(NA, 110, NA, 35, NA, 120, 280, 30, 240, 260, 322,
NA)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA,
-12L), spec = structure(list(cols = list(col1 = structure(list(), class = c("collector_double",
"collector")), col2 = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 2), class = "col_spec"))
df2 <- structure(list(col1 = c(1, 3, 5, 12), col2 = c(2127.6, 1403,
198, 112.8)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -4L), spec = structure(list(cols = list(
col1 = structure(list(), class = c("collector_double", "collector"
)), col2 = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 2), class = "col_spec"))
Using the tidyverse, you can merge and then add a new column that conditionally takes whichever value is not NA:
library(tidyverse)
df %>%
  merge(df2, by = "col1", all.x = TRUE) %>%
  mutate(new_col = if_else(is.na(col2.x), col2.y, col2.x)) %>%
  select(new_col)
   new_col
1   2127.6
2    110.0
3   1403.0
4     35.0
5    198.0
6    120.0
7    280.0
8     30.0
9    240.0
10   260.0
11   322.0
12   112.8
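The if_else() step is essentially what dplyr::coalesce() does (take the first non-missing value), so the same merge can be written a little more compactly; a sketch:
library(dplyr)
df %>%
  left_join(df2, by = "col1") %>%                 # produces col2.x (from df) and col2.y (from df2)
  mutate(new_col = coalesce(col2.x, col2.y)) %>%  # keep col2.x unless it is NA
  select(col1, new_col)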
I wrote the package safejoin, which solves this very succinctly:
# devtools::install_github("moodymudskipper/safejoin")
safe_left_join(df1,df2, by = "col1", conflict = dplyr::coalesce)
# # A tibble: 12 x 2
#     col1  col2
#    <dbl> <dbl>
#  1     1 2128.
#  2     2  110
#  3     3 1403
#  4     4   35
#  5     5  198
#  6     6  120
#  7     7  280
#  8     8   30
#  9     9  240
# 10    10  260
# 11    11  322
# 12    12  113.
