Proportion calculation based on time - r

I have a dataset that contains measurements taken at different points in time. I would like to calculate the percentage of times a measurement in one time period is followed by the same measurement in the next time period. I want to know how often each row has the same measurement from one period to the next. How can I do this?
Sample data:
structure(list(t1 = c(1, 2, 1), t2 = c(1, 1, 1), t3 = c(1, 3,
4), t4 = c(2, 2, 2), t5 = c(3, 3, 3), t6 = c(3, 3, 3), t7 = c(1,
1, 1)), row.names = c(NA, -3L), spec = structure(list(cols = list(
t1 = structure(list(), class = c("collector_double", "collector"
)), t2 = structure(list(), class = c("collector_double",
"collector")), t3 = structure(list(), class = c("collector_double",
"collector")), t4 = structure(list(), class = c("collector_double",
"collector")), t5 = structure(list(), class = c("collector_double",
"collector")), t6 = structure(list(), class = c("collector_double",
"collector")), t7 = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), delim = ","), class = "col_spec"), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"))

To compare each time period to the previous time period, it's probably easiest to put the data in long form and compare to the lag:
library(dplyr)
library(tidyr)
timedata |>
mutate(id = row_number()) |>
pivot_longer(
-id,
names_to = "time"
) |>
group_by(id) |>
mutate(nochange = value == lag(value)) |>
group_by(time) |>
summarise(
num_repeated = sum(nochange, na.rm = TRUE),
percent_repeated = num_repeated / n() * 100
)
# A tibble: 7 x 2
# time num_repeated percent_repeated
# <chr> <int> <dbl>
# 1 t1 0 0
# 2 t2 2 66.7
# 3 t3 1 33.3
# 4 t4 0 0
# 5 t5 0 0
# 6 t6 3 100
# 7 t7 0 0

If you call your dataframe df. Then:
equal <- as.data.frame(NA)
for (i in 1:(length(df)-1)) {
for (j in 1:nrow(df)) {
equal[j,i] <- df[j,i]== df[j, i+1]
}
}
sum(equal[TRUE])*100/(nrow(df)* length(df))
Notice that this compares whether t1= t2 (no comparisons are possible in the last column because there are no 'posterior' measurements)

Related

Compare two data frames and identify duplicates based on rows

How to identify the duplicate rows based on serial and day variables from two separate df? I tried to create an unique variable but without success.
Desired output:
Data structure:
df1
df2
Sample data:
df1<-0)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA,
-5L), spec = structure(list(cols = list(serial = structure(list(), class = c("collector_double",
"collector")), day = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
df2<-structure(list(serial = c(1, 2, 3, 4, 5, 5, 7), day = c(1, 1,
1, 0, 0, 1, 1)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -7L), spec = structure(list(cols = list(
serial = structure(list(), class = c("collector_double",
"collector")), day = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
base
df1 <- data.frame(serial = c(1:5), day = c(1, 0, 1, 0, 0))
df2 <- data.frame(serial = c(1, 2, 3, 4, 5, 5, 7), day = c(1, 1, 1, 0, 0, 1, 1))
df2$dup <- sapply(
X = paste0(df2$serial, df2$day),
FUN = function(x) !is.na(match(x = x, table = paste0(df1$serial, df1$day))))
df2
#> serial day dup
#> 1 1 1 TRUE
#> 2 2 1 FALSE
#> 3 3 1 TRUE
#> 4 4 0 TRUE
#> 5 5 0 TRUE
#> 6 5 1 FALSE
#> 7 7 1 FALSE
Created on 2021-06-30 by the reprex package (v2.0.0)

Compare and identify the missing rows

I would like to compare per row 2 df based on serial and day variables and to create a new column called compare to highlight the missing rows. How can this be done in R? I tried the inner_join function without success.
Sample structure df1 and df2
Desired output:
Sample data
df1<-structure(list(serial = c(1, 2, 3, 4, 5), day = c(1, 0, 1, 0,
0)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA,
-5L), spec = structure(list(cols = list(serial = structure(list(), class = c("collector_double",
"collector")), day = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
df2<-structure(list(serial = c(1, 2, 3, 4, 5, 5, 7), day = c(1, 0,
1, 0, 0, 1, 1)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -7L), spec = structure(list(cols = list(
serial = structure(list(), class = c("collector_double",
"collector")), day = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
We can use tidyverse
library(dplyr)
df2 %>%
mutate(compare = TRUE) %>%
left_join(df1 %>%
mutate(compare1 = TRUE), by = c('serial', 'day')) %>%
transmute(serial, day, compare = (!is.na(compare1)))
-output
# A tibble: 7 x 3
serial day compare
<dbl> <dbl> <lgl>
1 1 1 TRUE
2 2 0 TRUE
3 3 1 TRUE
4 4 0 TRUE
5 5 0 TRUE
6 5 1 FALSE
7 7 1 FALSE
Or with a faster and efficient data.table
library(data.table)
setDT(df2)[, compare := FALSE][setDT(df1), compare := TRUE, on = .(serial, day)]
One way would be to create a unique key combining the two columns and use %in% to find if the key is present in another dataset.
A base R option -
df2$compare <- do.call(paste, df2) %in% do.call(paste, df1)
df2
# A tibble: 7 x 3
# serial day compare
# <dbl> <dbl> <lgl>
#1 1 1 TRUE
#2 2 0 TRUE
#3 3 1 TRUE
#4 4 0 TRUE
#5 5 0 TRUE
#6 5 1 FALSE
#7 7 1 FALSE
If there are more columns in your data apart from serial and day use the below code.
cols <- c('serial', 'day')
df2$compare <- do.call(paste, df2[cols]) %in% do.call(paste, df1[cols])
A base R option
transform(
merge(cbind(df1, compare = TRUE), df2, all = TRUE),
compare = !is.na(compare)
)
gives
serial day compare
1 1 1 TRUE
2 2 0 TRUE
3 3 1 TRUE
4 4 0 TRUE
5 5 0 TRUE
6 5 1 FALSE
7 7 1 FALSE

Start and end of a period in a data frame

I'd like to return the value of the start and end of a data frame based on the data it contains. If there are only zeros than I would like to fill in the start and end column with NA.
Data structure:
Output:
Sample data:
structure(list(ID = c(1, 2, 3), A1 = c(1, 1,0), A2 = c(1, 1,0), A3 = c(0,
1,0), A4 = c(0, 1,0), A5 = c(0, 1,0)), class = c("spec_tbl_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -3L), spec = structure(list(
cols = list(ID = structure(list(), class = c("collector_double",
"collector")), A1 = structure(list(), class = c("collector_double",
"collector")), A2 = structure(list(), class = c("collector_double",
"collector")), A3 = structure(list(), class = c("collector_double",
"collector")), A4 = structure(list(), class = c("collector_double",
"collector")), A5 = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
Sample code (does not work on O rows):
start <- names(df1)[-1][max.col(df1[-1], "first")]
end <- names(df1)[-1][max.col(df1[-1], "last")]
data.frame(ID = df1$ID, start, end)
Does this work:
library(dplyr)
library(tidyr)
library(stringr)
df %>% pivot_longer(-ID) %>% group_by(ID) %>%
mutate(s = cumsum(value)) %>% mutate(s = na_if(s,0)) %>%
transmute(start = str_c('A',min(s)), end = str_c('A',max(s))) %>% distinct()
# A tibble: 3 x 3
# Groups: ID [3]
ID start end
<dbl> <chr> <chr>
1 1 A1 A2
2 2 A1 A5
3 3 NA NA
Using the base functions and a for-loop, you can loop through all the rows and note the lowest and highest column that contains 1. It would, however, not note any breaks in the streak. If your streak of 1 is interrupted by a 0, this would not show in the result.
id = c()
start = c()
end = c()
for(i in 1:dim(df)[1]){
id = c(id,df$ID[i])
row = df[i,-1]
start = c(start,names(row)[min((1:length(row))[row==1])])
end = c(end,names(row)[max((1:length(row))[row==1])])
}
out = data.frame(ID=id,
start=start,
end=end)
The output is:
> out
ID start end
1 1 A1 A2
2 2 A1 A5
3 3 <NA> <NA>
library(tidyverse)
df1 %>% group_by(ID) %>% #rowwise() %>%
summarise(start = list(names(cur_data())[as.logical(cur_data())]),
end = unlist(map(start, ~last(.x))),
start = unlist(map(start, ~first(.x))),
.groups = 'drop')
#> # A tibble: 3 x 3
#> ID start end
#> <dbl> <chr> <chr>
#> 1 1 A1 A2
#> 2 2 A1 A5
#> 3 3 <NA> <NA>
Created on 2021-06-16 by the reprex package (v2.0.0)
Below a little program which might help. However, this will just work if you are sure that there are no zeros within a row of ones. Your example data and sample code suggest that.
#your data
df1 <- structure(list(ID = c(1, 2, 3), A1 = c(1, 1,0), A2 = c(1, 1,0), A3 = c(0, 1,0), A4 = c(0, 1,0), A5 = c(0, 1,0)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA, -3L), spec = structure(list(cols = list(ID = structure(list(), class = c("collector_double", "collector")), A1 = structure(list(), class = c("collector_double", "collector")), A2 = structure(list(), class = c("collector_double", "collector")), A3 = structure(list(), class = c("collector_double", "collector")), A4 = structure(list(), class = c("collector_double", "collector")), A5 = structure(list(), class = c("collector_double", "collector"))), default = structure(list(), class = c("collector_guess", "collector")), skip = 1L), class = "col_spec"))
#use the library data.table
library(data.table)
df1 <- data.table(din)
#make a sum of by ID (by row)
df1[,sumUSE:=sum(.SD), by=ID]
#last
df1[,end:=names(df1)[(df1[,sumUSE]+1)]]
df1[end=="ID", end:=NA]
#first
df1[,start:=names(df1)[2]]
df1[is.na(end), start:=NA]
print(df1)
# ID A1 A2 A3 A4 A5 sumUSE end start
#1: 1 1 1 0 0 0 2 A2 A1
#2: 2 1 1 1 1 1 5 A5 A1
#3: 3 0 0 0 0 0 0 <NA> <NA>

Merge 2 data frame with respect to columns

I have 2 dataframes as shown. Can we merge with rep
df1
a b c
X a 2
X b 4
X c 1
Y a 2
Y b 1
df2
a1 c1
X 12
Y 10
Expected output (Because X and Y are top level values. Under X , we have a, b and c. Under Y, we have a and b. So we need to place them above these values.
Also, in another dataframe df2, we have values for both X and Y that need to populated into dataframe df1. Is this possible to acheive?
a b c
X 12
X a 2
X b 4
X c 1
Y 10
Y a 2
Y b 1
You could use dplyr:
library(dplyr)
df2 %>%
transmute(a = a1, b = a1, c = c1, prio = 1) %>%
bind_rows(df1 %>% mutate(prio = 2)) %>%
arrange(a, prio, b) %>%
mutate(a = ifelse(prio == 1, NA_character_, a)) %>%
select(-prio)
returns
# A tibble: 7 x 3
a b c
<chr> <chr> <dbl>
1 NA X 12
2 X a 2
3 X b 4
4 X c 1
5 NA Y 10
6 Y a 2
7 Y b 1
If you prefer an empty string over NA, just replace NA_character_ with "".
Data
df1 <- structure(list(a = c("X", "X", "X", "Y", "Y"), b = c("a", "b",
"c", "a", "b"), c = c(2, 4, 1, 2, 1)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -5L), spec = structure(list(
cols = list(a = structure(list(), class = c("collector_character",
"collector")), b = structure(list(), class = c("collector_character",
"collector")), c = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 2L), class = "col_spec"))
df2 <- structure(list(a1 = c("X", "Y"), c1 = c(12, 10)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -2L), spec = structure(list(
cols = list(a1 = structure(list(), class = c("collector_character",
"collector")), c1 = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))

Can I use the gather function in R matching 'key' and 'value' of multiple columns? [duplicate]

This question already has answers here:
Reshaping multiple sets of measurement columns (wide format) into single columns (long format)
(8 answers)
Closed 3 years ago.
I have a long dataset that looks like this:
dat <- data.frame(enterprise = c("a","b"), rev01 = c(1, 10), rev02 = c(2, 9), rev03 = c(3, 8), rev04 = c(4,7), rev05 = c(5, 6),
emp01 = c(6, 5), emp02 = c(7, 4), emp03 = c(8, 3), emp04 = c(9, 2), emp05 = c(10, 1))
Where "rev 1 to 5" is the revenue of the companies "a" and "b" in the years 1 to 5, and "emp 1 to 5" is the number of employees of these companies in the same period.
I wanted to transform this data from 'wide' to 'long' using the 'gather' function, but I don't know how to use this function to match the YEAR, the REVENUE, and the NUMBER OF EMPLOYEES.
What I wanted was something like this:
Thank you!
you can try this:
df %>%
gather(key, value, -company) %>%
separate(key, c("key", "year")) %>%
spread(key, value)
output is:
# A tibble: 10 x 4
company year emp rev
<chr> <chr> <dbl> <dbl>
1 a 1 6 1
2 a 2 7 2
3 a 3 8 3
4 a 4 9 4
5 a 5 10 5
6 b 1 5 10
7 b 2 4 9
8 b 3 3 8
9 b 4 2 7
10 b 5 1 6
I used this data:
structure(list(company = c("a", "b"), `rev 1` = c(1, 10), `rev 2` = c(2,
9), `rev 3` = c(3, 8), `rev 4` = c(4, 7), `rev 5` = c(5, 6),
`emp 1` = c(6, 5), `emp 2` = c(7, 4), `emp 3` = c(8, 3),
`emp 4` = c(9, 2), `emp 5` = c(10, 1)), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -2L), spec = structure(list(
cols = list(company = structure(list(), class = c("collector_character",
"collector")), `rev 1` = structure(list(), class = c("collector_double",
"collector")), `rev 2` = structure(list(), class = c("collector_double",
"collector")), `rev 3` = structure(list(), class = c("collector_double",
"collector")), `rev 4` = structure(list(), class = c("collector_double",
"collector")), `rev 5` = structure(list(), class = c("collector_double",
"collector")), `emp 1` = structure(list(), class = c("collector_double",
"collector")), `emp 2` = structure(list(), class = c("collector_double",
"collector")), `emp 3` = structure(list(), class = c("collector_double",
"collector")), `emp 4` = structure(list(), class = c("collector_double",
"collector")), `emp 5` = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))

Resources