Get difference with previous value and with next value - r

I have a dataframe like this. A small sample actually the df is bigger:
LOW 1 4 NA
MID 3 4 4
HIG 2 5 4
And would like to get the difference for LOW and HIG with MID so the ending df would be like this:
LOW 2 0 NA
MID 3 4 4
HIG 1 1 0
So you're getting: LOW = 3 - 1 = 2 and HIG = 3 - 2 = 1. I can do it via VBA macros but want to scale with R.

It can be done with mutate_if/mutate_at
library(dplyr)
df1 %>%
mutate_if(is.numeric, ~ case_when(grp != 'MID' ~
abs(. - .[grp == 'MID']), TRUE ~ .))
# grp v1 v2 v3
#1 LOW 2 0 NA
#2 MID 3 4 4
#3 HIG 1 1 0
Or in base R
i1 <- df1$grp == 'MID'
df1[!i1, -1] <- abs(df1[!i1, -1] - rep(unlist(df1[i1, -1]), each = sum(!i1)))
data
df1 <- structure(list(grp = c("LOW", "MID", "HIG"), v1 = c(1L, 3L, 2L
), v2 = c(4L, 4L, 5L), v3 = c(NA, 4L, 4L)), class = "data.frame", row.names = c(NA,
-3L))

You can change the 'LOW', 'HIG' rows after subtracting by 'MID' :
df1[df1$grp == 'LOW', -1] <- abs(df1[df1$grp == 'MID',-1]- df1[df1$grp == 'LOW',-1])
df1[df1$grp == 'HIG', -1] <- abs(df1[df1$grp == 'MID',-1]- df1[df1$grp == 'HIG',-1])
df1
# grp v1 v2 v3
#1 LOW 2 0 NA
#2 MID 3 4 4
#3 HIG 1 1 0

Related

How to group by a variable and see if they have another observation within a given time frame, R

I have something like the following:
person_ID visit date
1 2/25/2001
1 2/30/2001
1 4/2/2001
2 3/18/2004
3 9/22/2004
3 10/27/2004
3 5/15/2008
I want to add another column to see if the person has a reoccurring observation within 90 days, like:
person_ID visit date reoccurrence
1 2/25/2001 1
1 2/30/2001 1
1 4/2/2001 0
2 3/18/2004 0
3 9/22/2004 1
3 10/27/2004 0
3 5/15/2008 0
any help is appreciated, thank you!
If the second 'date' is not 2/30/2001, convert the 'visit_date' to Date class, grouped by 'person_id', get the difference between current and next 'visit_date' in 'day', check if it is less than 90, replace the NA with 0
library(dplyr)
library(lubridate)
library(tidyr)
df1 <- df1 %>%
mutate(visit_date = mdy(visit_date)) %>%
group_by(person_ID) %>%
mutate(reoccurrence = replace_na(+(difftime(lead(visit_date),
visit_date, units = 'day') < 90), 0)) %>%
ungroup
-output
# A tibble: 7 x 3
# person_ID visit_date reoccurrence
# <int> <date> <dbl>
#1 1 2001-02-25 1
#2 1 2001-02-28 1
#3 1 2001-04-02 0
#4 2 2004-03-18 0
#5 3 2004-09-22 1
#6 3 2004-10-27 0
#7 3 2008-05-15 0
Or using data.table
library(data.table)
# Convert df1 to a data.table by reference and parse the m/d/Y strings as dates.
setDT(df1)[, visit_date := as.IDate(visit_date, '%m/%d/%Y')
  # Within each person, compare every visit with that person's next visit:
  # 1 when the next visit is fewer than 90 days later, NA for the last visit.
  ][, reoccurrence := +(difftime(shift(visit_date, type = 'lead'),
                                 visit_date, units = 'days') < 90),
    by = person_ID
  # A person's last visit has no following visit, so it counts as 0.
  ][is.na(reoccurrence), reoccurrence := 0L]
Or with base R
df1$visit_date <- as.Date(df1$visit_date, '%m/%d/%Y')
with(df1, ave(as.integer(visit_date), person_ID, FUN =
function(x) c(+(diff(x) < 90), 0)))
#[1] 1 1 0 0 1 0 0
data
df1 <- structure(list(person_ID = c(1L, 1L, 1L, 2L, 3L, 3L, 3L), visit_date = c("2/25/2001",
"2/28/2001", "4/2/2001", "3/18/2004", "9/22/2004", "10/27/2004",
"5/15/2008")), row.names = c(NA, -7L), class = "data.frame")
Base R variant:
# For each element of `x`, report whether some element that comes later in
# the vector is between 0 and `lim` (inclusive) larger than it.
# Returns a logical vector the same length as `x`.
reoccur <- function(x, lim=90) {
  # gaps[i, j] is x[i] - x[j]
  gaps <- outer(x, x, `-`)
  # keep only pairs where i comes strictly after j
  later <- lower.tri(gaps)
  in_window <- later & !is.na(gaps) & gaps >= 0 & gaps <= lim
  # a column with at least one hit means element j has a reoccurrence
  colSums(in_window) > 0
}
### make your dates *dates*
dat$visit <- as.Date(dat$visit, format="%m/%d/%Y")
### calculate if you have reoccurrences
ave(as.numeric(dat$visit), dat$person_ID, FUN=reoccur)
# [1] 1 1 0 0 1 0 0
Data:
dat <- structure(list(person_ID = c(1L, 1L, 1L, 2L, 3L, 3L, 3L), visit = c("2/25/2001", "2/27/2001", "4/2/2001", "3/18/2004", "9/22/2004", "10/27/2004", "5/15/2008")), class = "data.frame", row.names = c(NA, -7L))
(I changed "2/30/2001" to "2/27/2001" to get a real Date out of it.)

How to get merged data frame from two data frames having some same columns(R)

I want to merge two data frames so that, for the columns they have in common, the values from one data frame are added to the existing values of the other.
For example:
df1
No
A
B
C
D
1
1
0
1
0
2
0
1
2
1
3
0
0
1
0
df2
No
A
B
E
F
1
1
0
1
1
2
0
1
2
1
3
2
1
1
0
Finally, I want the output table like this.
df
No
A
B
C
D
E
F
1
2
0
1
0
1
1
2
0
2
2
1
2
1
3
2
1
1
0
1
0
Note: I did try merge(), but in this case, it did not work.
Any help/suggestion would be appreciated.
Reproducible sample data
df1 <-
structure(list(No = 1:3, A = c(1L, 0L, 0L), B = c(0L, 1L, 0L),
C = c(1L, 2L, 1L), D = c(0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-3L))
df2 <-
structure(list(No = 1:3, A = c(1L, 0L, 2L), B = c(0L, 1L, 1L),
E = c(1L, 2L, 1L), F = c(1L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-3L))
You can also carry out this operation by left_joining these two data frames:
library(dplyr)
library(stringr)
df1 %>%
left_join(df2, by = "No") %>%
mutate(across(ends_with(".x"), ~ .x + get(str_replace(cur_column(), "\\.x", "\\.y")))) %>%
rename_with(~ str_replace(., "\\.x", ""), ends_with(".x")) %>%
select(!ends_with(".y"))
No A B C D E F
1 1 2 0 1 0 1 1
2 2 0 2 2 1 2 1
3 3 2 1 1 0 1 0
You can first row-bind the two dataframes and then compute the sum of each column while 'grouping' by the No column. This can be done like so:
library(dplyr)
# Stack both data frames (columns missing from one are filled with NA),
# then sum every value column within each No.
bind_rows(df1, df2) %>%
  group_by(No) %>%
  # na.rm = TRUE is given inside the lambda: passing extra arguments through
  # across(...) is deprecated as of dplyr 1.1.0.
  summarise(across(c(A, B, C, D, E, `F`), \(x) sum(x, na.rm = TRUE)),
            .groups = "drop")
If a particular column doesn't exist in one dataframe (i.e. columns E and F), values will be padded with NA. Adding the na.rm = TRUE argument (to be passed to sum()) means that these values will get treated like zeros.
Using data.table :
library(data.table)
rbindlist(list(df1, df2), fill = TRUE)[, lapply(.SD, sum, na.rm = TRUE), No]
# No A B C D E F
#1: 1 2 0 1 0 1 1
#2: 2 0 2 2 1 2 1
#3: 3 2 1 1 0 1 0
We can use base R (with R 4.1.0). Get the values of the objects in a list ('lst1'). Then, find the union of the column names ('nm1'). Loop over the list and, in each element, add the columns it is missing (found with setdiff) filled with 0; then rbind the elements and use aggregate to get the sum grouped by 'No'
lst1 <- mget(ls(pattern= '^df\\d+$'))
nm1 <- lapply(lst1, names) |>
{\(x) Reduce(union, x)}()
lapply(lst1, \(x) {x[setdiff(nm1, names(x))] <- 0; x}) |>
{\(x) do.call(rbind, x)}() |>
{\(dat) aggregate(.~ No, data = dat, FUN = sum, na.rm = TRUE,
na.action = na.pass)}()
# No A B C D E F
#1 1 2 0 1 0 1 1
#2 2 0 2 2 1 2 1
#3 3 2 1 1 0 1 0

Creating duplicated data frames with different ID

I have a question for the community and hoping for some help.
I am trying to duplicate a data frame like the one below:
ID Time Solve
1 0 1
1 2 2
1 4 3
1 6 1
I am trying to duplicate the above data frame 100 times so, it would read as below:
ID Time Solve
1 0 1
1 2 2
1 4 3
1 6 1
2 0 1
2 2 2
2 4 3
2 6 1
3 0 1
3 2 2
3 4 3
3 6 1
4 0 1
4 2 2
4 4 3
4 6 1
.....
100 0 1
100 2 2
100 4 3
100 6 1
Does anyone have a good solution for this or a resource to read up on this?
Thanks!
We can use replicate
out <- do.call(rbind, replicate(100, df1, simplify = FALSE))
out$ID <- as.integer(gl(nrow(out), nrow(df1), nrow(out)))
Or another option is rep
out <- df1[rep(seq_len(nrow(df1)), 100),]
out$ID <- as.integer(gl(nrow(out), nrow(df1), nrow(out)))
Or make use of uncount
library(tidyr)
library(dplyr)
# uncount() repeats each row 100 times *consecutively* (row 1 x 100, then
# row 2 x 100, ...), so number the copies 1..100 within every original row
# and then sort by that number to get one complete copy of df1 per ID.
uncount(df1, 100) %>%
  mutate(ID = rep(seq_len(100), times = nrow(df1))) %>%
  arrange(ID)
Or another option is
df1 %>%
nest_by(ID) %>%
uncount(100) %>%
mutate(ID = row_number()) %>%
unnest(c(data))
data
df1 <- structure(list(ID = c(1L, 1L, 1L, 1L), Time = c(0L, 2L, 4L, 6L
), Solve = c(1L, 2L, 3L, 1L)), class = "data.frame", row.names = c(NA,
-4L))

Repeat a value within each ID

I have a dataset in R in long format. Each ID does not appear the same number of times (i.e. one ID might be one row, another might appear 79 rows).
e.g.
ID V1 V2
1 B 0
1 A 1
1 C 0
2 C 0
3 A 0
3 C 0
I want to create a variable V3 that is 1 on every row of an ID if any of the rows for that ID has V2 == 1, and 0 otherwise
e.g.
ID V1 V2 V3
1 B 0 1
1 A 1 1
1 C 0 1
2 C 0 0
3 A 0 0
3 C 0 0
In base R we can use any - and ave for the grouping.
DF$V3 <- with(DF, ave(V2, ID, FUN = function(x) any(x == 1)))
DF
# ID V1 V2 V3
#1 1 B 0 1
#2 1 A 1 1
#3 1 C 0 1
#4 2 C 0 0
#5 3 A 0 0
#6 3 C 0 0
data
DF <- structure(list(ID = c(1L, 1L, 1L, 2L, 3L, 3L), V1 = c("B", "A",
"C", "C", "A", "C"), V2 = c(0L, 1L, 0L, 0L, 0L, 0L)), .Names = c("ID",
"V1", "V2"), class = "data.frame", row.names = c(NA, -6L))
Here's a tidyverse solution.
If V2 can only be 0 or 1:
library(dplyr)
df %>%
group_by(ID) %>%
mutate(V3 = max(V2))
If you want to check that V2 is exactly 1.
df %>%
group_by(ID) %>%
mutate(V3 = as.numeric(any(V2 == 1)))
Another base R option is
# Sum V2 within each ID; the row names of the result are the ID values.
id_totals <- rowsum(df$V2, df$ID)
# Match on the ID values (row names), not on row positions, so this also
# works when the IDs are not exactly 1, 2, ..., k.
df$V3 <- +(df$ID %in% rownames(id_totals)[id_totals > 0])

extract the row having column values length equal to 1 in R

my input data is
df
anger sad joy happy trust disgust
1 1 0 1 2 3 0
2 2 0 0 2 0 3
3 2 2 1 1 1 1
4 0 1 1 1 0 1
I want output like this
mydata
anger sad joy happy trust disgust col
1 1 0 1 2 3 0 trust
2 2 0 0 2 0 3 disgust
I want to extract the column name of the max value in each row, but output only those rows that have exactly one max-value column and discard every row where the max is shared by more than one column.
i tried this
d1 <- df[!apply(df[-1], 1, function(x) anyDuplicated(x[x == max(x)])),]
but i am getting this
anger sad joy happy trust disgust
1 1 0 1 2 3 0
2 2 0 0 2 0 3
3 2 2 1 1 1 1
I don't want third row in the output.
Thanks for help in advance.
We can use max.col to get the index of columns for each row after subsetting the rows
d1 <- mydata[!apply(mydata[-1], 1, anyDuplicated),]
d1$out <- names(d1)[-1][max.col(d1[-1], 'first')]
d1
# zone_id v1 v2 v3 v4 out
#1 1 12 15 18 20 v4
#3 3 31 28 14 2 v1
#4 4 12 16 9 5 v2
#5 5 5 18 10 12 v2
Update
If the OP wanted to remove only the duplicate values of max values, then replace the first line with
d1 <- mydata[!apply(mydata[-1], 1, function(x) anyDuplicated(x[x == max(x)])),]
Update2
Based on the new dataset by the OP, we don't need to remove the first column as it is not an id column
d2 <- mydata1[!apply(mydata1, 1, function(x) anyDuplicated(x[x == max(x)])),]
d2$out <- names(d2)[max.col(d2, 'first')]
d2
# anger sad joy happy trust disgust out
#1 1 0 1 2 3 0 trust
#2 2 0 0 2 0 3 disgust
data
mydata1 <- structure(list(anger = c(1L, 2L, 2L, 0L), sad = c(0L, 0L, 2L,
1L), joy = c(1L, 0L, 1L, 1L), happy = c(2L, 2L, 1L, 1L), trust = c(3L,
0L, 1L, 0L), disgust = c(0L, 3L, 1L, 1L)), .Names = c("anger", "sad",
"joy", "happy", "trust", "disgust"), row.names = c(NA, 4L),
class = "data.frame")
you can try:
mydata %>%
select(-zone_id) %>%
mutate(mx = do.call(pmax, (.))) %>%
select(mx) %>%
cbind(mydata) %>%
mutate( flg = rowSums(. == mx)) %>%
filter(flg ==2) %>%
select(-flg) %>%
gather(key = out, value= v, -mx, -zone_id) %>%
filter(mx == v) %>%
select(zone_id, mx, out) %>%
left_join(mydata)
which gives:
zone_id mx out v1 v2 v3 v4
1 3 31 v1 31 28 2 2
2 4 16 v2 1 16 9 1
3 5 18 v2 5 18 10 12
4 1 20 v4 12 15 18 20

Resources