Compare two data frames and identify duplicates based on rows - r

How can I identify the rows that are duplicated across two separate data frames, based on the serial and day variables? I tried to create a unique key variable, but without success.
Desired output:
Data structure:
df1
df2
Sample data:
# Sample data frame 1 (dput() style literal): columns `serial` and `day`,
# 5 rows. The head of this literal was truncated in the source
# ("df1<-0)), ..."); the serial/day values are restored from the equivalent
# data.frame() definition of df1 given in the base R answer, which matches the
# surviving "0))" tail and the 5-row row.names.
df1 <- structure(
  list(serial = c(1, 2, 3, 4, 5), day = c(1, 0, 1, 0, 0)),
  class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -5L),
  spec = structure(
    list(
      cols = list(
        serial = structure(list(), class = c("collector_double", "collector")),
        day = structure(list(), class = c("collector_double", "collector"))
      ),
      default = structure(list(), class = c("collector_guess", "collector")),
      skip = 1L
    ),
    class = "col_spec"
  )
)
# Sample data frame 2 (dput() style literal): columns `serial` and `day`,
# 7 rows; serial 5 appears twice, with different `day` values.
# NOTE(review): the "spec_tbl_df" class and `spec` attribute suggest the data
# was read with readr - confirm.
df2<-structure(list(serial = c(1, 2, 3, 4, 5, 5, 7), day = c(1, 1,
1, 0, 0, 1, 1)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -7L), spec = structure(list(cols = list(
serial = structure(list(), class = c("collector_double",
"collector")), day = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))

base
# Base R: flag every row of df2 whose (serial, day) pair also occurs in df1.
df1 <- data.frame(serial = 1:5, day = c(1, 0, 1, 0, 0))
df2 <- data.frame(
  serial = c(1, 2, 3, 4, 5, 5, 7),
  day = c(1, 1, 1, 0, 0, 1, 1)
)
# Build one composite key per row and test membership in a single vectorised
# step. The explicit separator keeps the keys unambiguous: pasted without one,
# serial = 1, day = 11 and serial = 11, day = 1 would both become "111".
# %in% never returns NA, so no per-element sapply()/match() is needed.
df2$dup <- paste(df2$serial, df2$day, sep = "_") %in%
  paste(df1$serial, df1$day, sep = "_")
df2
#> serial day dup
#> 1 1 1 TRUE
#> 2 2 1 FALSE
#> 3 3 1 TRUE
#> 4 4 0 TRUE
#> 5 5 0 TRUE
#> 6 5 1 FALSE
#> 7 7 1 FALSE
Created on 2021-06-30 by the reprex package (v2.0.0)

Related

Proportion calculation based on time

I have a dataset that contains measurements taken at different points in time. I would like to calculate the percentage of times a measurement in one time period is followed by the same measurement in the next time period. I want to know how often each row has the same measurement from one period to the next. How can I do this?
Sample data:
# Sample data (dput() style literal): 3 rows of measurements in columns t1-t7.
# NOTE(review): the literal is not assigned to a name here; the dplyr answer
# below refers to the data as `timedata` and the loop answer as `df`.
structure(list(t1 = c(1, 2, 1), t2 = c(1, 1, 1), t3 = c(1, 3,
4), t4 = c(2, 2, 2), t5 = c(3, 3, 3), t6 = c(3, 3, 3), t7 = c(1,
1, 1)), row.names = c(NA, -3L), spec = structure(list(cols = list(
t1 = structure(list(), class = c("collector_double", "collector"
)), t2 = structure(list(), class = c("collector_double",
"collector")), t3 = structure(list(), class = c("collector_double",
"collector")), t4 = structure(list(), class = c("collector_double",
"collector")), t5 = structure(list(), class = c("collector_double",
"collector")), t6 = structure(list(), class = c("collector_double",
"collector")), t7 = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))
To compare each time period to the previous time period, it's probably easiest to put the data in long form and compare to the lag:
library(dplyr)
library(tidyr)

# Reshape to one row per (id, time) pair, flag each measurement that equals
# the same id's measurement in the preceding period, then summarise per period.
timedata |>
  mutate(id = row_number()) |>
  pivot_longer(cols = -id, names_to = "time") |>
  group_by(id) |>
  # lag() gives NA for the first period of every id, so nochange is NA there.
  mutate(nochange = lag(value) == value) |>
  group_by(time) |>
  summarise(
    # NA comparisons are dropped from the count but still counted in n().
    num_repeated = sum(nochange, na.rm = TRUE),
    percent_repeated = num_repeated / n() * 100
  )
# A tibble: 7 x 2
# time num_repeated percent_repeated
# <chr> <int> <dbl>
# 1 t1 0 0
# 2 t2 2 66.7
# 3 t3 1 33.3
# 4 t4 0 0
# 5 t5 0 0
# 6 t6 3 100
# 7 t7 0 0
If you call your data frame df, then:
# Percentage of period-to-period transitions in which the measurement stays
# the same. `df` is the data frame of measurements, one column per period.
#
# Compare every column with the column to its right (t1 vs t2, t2 vs t3, ...)
# in one vectorised step instead of filling a data frame cell by cell; this
# also avoids 1:(length(df) - 1) and 1:nrow(df), which count backwards when
# df has a single column or no rows.
equal <- as.data.frame(as.matrix(df[-length(df)]) == as.matrix(df[-1]))
# Divide by the number of comparisons actually made, nrow * (ncol - 1): the
# last column has no following period, so it must not be counted.
sum(equal) * 100 / (nrow(df) * (length(df) - 1))
Note that this compares each column with the next one (t1 with t2, t2 with t3, and so on); the last column cannot be compared with anything because there is no later measurement.

How can i LAG the previous value that meets a condition in other column (R)?

For each row I would like to return a previous value, but not simply the immediately preceding one (n = 1): the previous row must meet a condition in another column. In this case the condition is Presence = 1.
Table with expected result
Thanks!
You could use dplyr and tidyr:
library(dplyr)
library(tidyr)

data %>%
  # cumsum(presence) steps up at every presence == 1 row, so each
  # (person, presence_run) group holds the presence == 0 rows together with
  # the presence == 1 row that precedes them, if that person has one.
  group_by(person, presence_run = cumsum(presence)) %>%
  # Keep the result only where presence is non-zero ...
  mutate(expected_lag = ifelse(presence == 0, NA, presence * result)) %>%
  # ... and carry it down over the following presence == 0 rows.
  fill(expected_lag, .direction = "down") %>%
  group_by(person) %>%
  # Shift by one row within each person so a row never sees its own result.
  mutate(expected_lag = lag(expected_lag)) %>%
  ungroup() %>%
  select(-presence_run)
which returns
# A tibble: 9 x 4
person presence result expected_lag
<chr> <dbl> <dbl> <dbl>
1 Ane 1 5 NA
2 Ane 0 6 5
3 Ane 0 4 5
4 Ane 1 8 5
5 Ane 1 7 8
6 John 0 9 NA
7 John 1 2 NA
8 John 0 4 2
9 John 1 3 2
Data
For simplification I removed the date column.
# Example data (dput() style literal): 9 rows with columns `person`,
# `presence` (0/1 in this sample) and `result`.
# NOTE(review): the literal is not assigned to a name here; the answer above
# refers to it as `data`.
structure(list(person = c("Ane", "Ane", "Ane", "Ane", "Ane",
"John", "John", "John", "John"), presence = c(1, 0, 0, 1, 1,
0, 1, 0, 1), result = c(5, 6, 4, 8, 7, 9, 2, 4, 3)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -9L), spec = structure(list(
cols = list(person = structure(list(), class = c("collector_character",
"collector")), presence = structure(list(), class = c("collector_double",
"collector")), result = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))

Merge 2 data frame with respect to columns

I have 2 data frames as shown. Can we merge them with respect to their columns?
df1
a b c
X a 2
X b 4
X c 1
Y a 2
Y b 1
df2
a1 c1
X 12
Y 10
Expected output (X and Y are top-level values: under X we have a, b and c, and under Y we have a and b, so X and Y need to be placed above these values).
Also, the other data frame, df2, holds values for both X and Y that need to be populated into df1. Is this possible to achieve?
a b c
X 12
X a 2
X b 4
X c 1
Y 10
Y a 2
Y b 1
You could use dplyr:
library(dplyr)

# Turn every df2 row into a header row (prio = 1), stack the df1 detail rows
# (prio = 2) underneath, and sort so each header sits directly above its group.
df2 %>%
  transmute(a = a1, b = a1, c = c1, prio = 1) %>%
  bind_rows(mutate(df1, prio = 2)) %>%
  arrange(a, prio, b) %>%
  # Header rows carry their label in column b, so column a is blanked out.
  mutate(a = replace(a, prio == 1, NA_character_)) %>%
  select(-prio)
returns
# A tibble: 7 x 3
a b c
<chr> <chr> <dbl>
1 NA X 12
2 X a 2
3 X b 4
4 X c 1
5 NA Y 10
6 Y a 2
7 Y b 1
If you prefer an empty string over NA, just replace NA_character_ with "".
Data
# Detail rows (dput() style literal): 5 rows with columns `a` (group label),
# `b` (item within the group) and `c` (numeric value).
df1 <- structure(list(a = c("X", "X", "X", "Y", "Y"), b = c("a", "b",
"c", "a", "b"), c = c(2, 4, 1, 2, 1)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -5L), spec = structure(list(
cols = list(a = structure(list(), class = c("collector_character",
"collector")), b = structure(list(), class = c("collector_character",
"collector")), c = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 2L), class = "col_spec"))
# Group-level rows (dput() style literal): one row per group label `a1` with
# its numeric value `c1`.
df2 <- structure(list(a1 = c("X", "Y"), c1 = c(12, 10)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -2L), spec = structure(list(
cols = list(a1 = structure(list(), class = c("collector_character",
"collector")), c1 = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))

R how to generate size classes from size data

I am attempting to generate a vector of size classes for coral size data I have collected. What I want to do is create a new column that represents size class for each individual measurement.
I want size classes to be based on every 10 units of measure. For example, if a coral is size 1-10, I want that to be size class 1, if 11-20 I want that to be size class 2, if 21-30 I want that to be size class 3 etc.
Any help to accomplish this seemingly easy task would be appreciated. Thanks!
Database
# Input data (dput() style literal): 6 rows with columns `Coral` and `Size`.
# NOTE(review): the literal is not assigned to a name here; the answers below
# refer to it as `df`.
structure(list(Coral = c(1, 2, 3, 4, 5, 6), Size = c(6, 10, 12,
40, 14, 22)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L), spec = structure(list(cols = list(
Coral = structure(list(), class = c("collector_double", "collector"
)), Size = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
Desired Output
# Desired result (dput() style literal): the same 6 rows plus a `Size Class`
# column holding the class (1, 1, 2, 4, 2, 3) for each Size value.
structure(list(Coral = c(1, 2, 3, 4, 5, 6), Size = c(6, 10, 12,
40, 14, 22), `Size Class` = c(1, 1, 2, 4, 2, 3)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), spec = structure(list(
cols = list(Coral = structure(list(), class = c("collector_double",
"collector")), Size = structure(list(), class = c("collector_double",
"collector")), `Size Class` = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
You could use ceiling :
# Classes are 10 units wide, so rounding Size / 10 up gives the class number:
# 1-10 -> 1, 11-20 -> 2, 21-30 -> 3, ...
df[["Size_class"]] <- ceiling(df[["Size"]] / 10)
# Coral Size Size_class
# <dbl> <dbl> <int>
#1 1 6 1
#2 2 10 1
#3 3 12 2
#4 4 40 4
#5 5 14 2
#6 6 22 3
Or findInterval
# Same classes via findInterval(): breaks at 0, 10, 20, ... up to the largest
# Size. left.open = TRUE makes each interval (lower, upper], so a size of
# exactly 10 stays in class 1.
df$Size_class <- findInterval(
  x = df$Size,
  vec = seq(from = 0, to = max(df$Size), by = 10),
  left.open = TRUE
)

Can I use the gather function in R matching 'key' and 'value' of multiple columns? [duplicate]

This question already has answers here:
Reshaping multiple sets of measurement columns (wide format) into single columns (long format)
(8 answers)
Closed 3 years ago.
I have a wide dataset that looks like this:
# Wide example data: one row per enterprise, with columns rev01-rev05 and
# emp01-emp05 (described below as revenue and number of employees per year).
dat <- data.frame(
  enterprise = c("a", "b"),
  rev01 = c(1, 10),
  rev02 = c(2, 9),
  rev03 = c(3, 8),
  rev04 = c(4, 7),
  rev05 = c(5, 6),
  emp01 = c(6, 5),
  emp02 = c(7, 4),
  emp03 = c(8, 3),
  emp04 = c(9, 2),
  emp05 = c(10, 1)
)
Where "rev 1 to 5" is the revenue of the companies "a" and "b" in the years 1 to 5, and "emp 1 to 5" is the number of employees of these companies in the same period.
I wanted to transform this data from 'wide' to 'long' using the 'gather' function, but I don't know how to use this function to match the YEAR, the REVENUE, and the NUMBER OF EMPLOYEES.
What I wanted was something like this:
Thank you!
you can try this:
# Stack every measurement column into key/value pairs, split each key into
# the variable name and the year, then spread the variables back out so that
# each one becomes its own column.
df %>%
  gather(key = "key", value = "value", -company) %>%
  separate(col = key, into = c("key", "year")) %>%
  spread(key = key, value = value)
output is:
# A tibble: 10 x 4
company year emp rev
<chr> <chr> <dbl> <dbl>
1 a 1 6 1
2 a 2 7 2
3 a 3 8 3
4 a 4 9 4
5 a 5 10 5
6 b 1 5 10
7 b 2 4 9
8 b 3 3 8
9 b 4 2 7
10 b 5 1 6
I used this data:
# Data used for the answer above (dput() style literal): 2 rows with a
# `company` column plus `rev 1`-`rev 5` and `emp 1`-`emp 5`. The column names
# contain a space between the variable and the year.
# NOTE(review): the literal is not assigned to a name here; the answer refers
# to it as `df`.
structure(list(company = c("a", "b"), `rev 1` = c(1, 10), `rev 2` = c(2,
9), `rev 3` = c(3, 8), `rev 4` = c(4, 7), `rev 5` = c(5, 6),
`emp 1` = c(6, 5), `emp 2` = c(7, 4), `emp 3` = c(8, 3),
`emp 4` = c(9, 2), `emp 5` = c(10, 1)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -2L), spec = structure(list(
cols = list(company = structure(list(), class = c("collector_character",
"collector")), `rev 1` = structure(list(), class = c("collector_double",
"collector")), `rev 2` = structure(list(), class = c("collector_double",
"collector")), `rev 3` = structure(list(), class = c("collector_double",
"collector")), `rev 4` = structure(list(), class = c("collector_double",
"collector")), `rev 5` = structure(list(), class = c("collector_double",
"collector")), `emp 1` = structure(list(), class = c("collector_double",
"collector")), `emp 2` = structure(list(), class = c("collector_double",
"collector")), `emp 3` = structure(list(), class = c("collector_double",
"collector")), `emp 4` = structure(list(), class = c("collector_double",
"collector")), `emp 5` = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))

Resources