identify change in row observation values - r

I am interested in identifying individuals that go from a 0 to a .5 or a 1. Here is example data:
id   x
1    0
1    1
1    1
2    0
2    .5
2    .5
3    0
3    0
I want a new df that indicates whether or not each person moved from a 0 to .5 or 1. Something like this:
id  endorsed
1   TRUE
2   TRUE
3   FALSE
I tried using ifelse for this, but I just can't get the code right. Does anyone have any suggestions?

An option is to group by 'id' and check for any value of 'x' that is 0 where the next value (using lead) is either 0.5 or 1 (this assumes the values are stored exactly, i.e. no floating-point precision issues):
library(dplyr)
df1 %>%
    group_by(id) %>%
    summarise(endorsed = any(x == 0 & lead(x) %in% c(0.5, 1)))
-output
# A tibble: 3 x 2
# id endorsed
#* <int> <lgl>
#1 1 TRUE
#2 2 TRUE
#3 3 FALSE
data
df1 <- structure(list(id = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L), x = c(0,
1, 1, 0, 0.5, 0.5, 0, 0)), class = "data.frame", row.names = c(NA,
-8L))
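If x is computed rather than entered literally, comparing doubles with == can misfire. A more defensive sketch of the same logic uses dplyr::near() with its default tolerance (na.rm = TRUE handles the NA that lead() produces at the end of each group):
library(dplyr)
df1 %>%
    group_by(id) %>%
    summarise(endorsed = any(near(x, 0) & (near(lead(x), 0.5) | near(lead(x), 1)),
                             na.rm = TRUE))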

Since they always start with zero, you can just group and ask whether they ever hit 0.5 or 1:
df1 %>% group_by(id) %>% summarize(endorsed = any(x %in% c(0.5, 1)))

A base R option using aggregate (note the data set is df1, not df):
aggregate(
    cbind(endorsed = x) ~ id,
    df1,
    function(v) head(v, 1) == 0 & tail(cummax(v), 1) %in% c(.5, 1)
)
gives
id endorsed
1 1 TRUE
2 2 TRUE
3 3 FALSE
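A compact tapply() variant of the same idea (a sketch; it returns a named logical vector keyed by id rather than a data frame):
with(df1, tapply(x, id, function(v) v[1] == 0 && any(v %in% c(0.5, 1))))
#    1     2     3
# TRUE  TRUE FALSE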

Related

cumsum with a condition to restart in R

I have this dataset containing multiple columns. I want to use cumsum() on a column, conditioning the sum on another column: when event "x" happens I want the sum to restart from zero, but I also want the row where the event occurs to be included in the running sum. The example below makes this precise.
inv ass port G cumsum(G)
i   x   2    1 1
i   x   2    0 1
i   x   0    1 2
i   x   3    0 0
i   x   3    1 1
So in the 3rd row the condition port == 0 occurs. I still want that row's G added to the running sum, and the count to restart from the following row.
I'm using dplyr to group_by(investor, asset), but I'm stuck, since what I have is:
res1 <- res %>%
    group_by(investor, asset) %>%
    mutate(posdays = ifelse(operation < 0 & portfolio == 0, 0, cumsum(G))) %>%
    ungroup()
This restarts the cumsum(), but it excludes the 3rd row's value from the sum.
I think I need something saying "cumsum(G), but when condition "x" held in the previous row, restart the sum from the current row".
Can you help me?
You may use cumsum to create groups as well.
library(dplyr)
df <- df %>%
    group_by(group = cumsum(dplyr::lag(port == 0, default = 0))) %>%
    mutate(cumsum_G = cumsum(G)) %>%
    ungroup()
df
# inv ass port G group cumsum_G
# <chr> <chr> <int> <int> <dbl> <int>
#1 i x 2 1 0 1
#2 i x 2 0 0 1
#3 i x 0 1 0 2
#4 i x 3 0 1 0
#5 i x 3 1 1 1
You may remove the group column from output using %>% select(-group).
data
df <- structure(list(inv = c("i", "i", "i", "i", "i"), ass = c("x",
"x", "x", "x", "x"), port = c(2L, 2L, 0L, 3L, 3L), G = c(1L,
0L, 1L, 0L, 1L)), class = "data.frame", row.names = c(NA, -5L))

How to group by a variable and see if they have another observation within a given time frame, R

I have something like the following:
person_ID visit date
1         2/25/2001
1         2/30/2001
1         4/2/2001
2         3/18/2004
3         9/22/2004
3         10/27/2004
3         5/15/2008
I want to add another column to see if the person has a reoccurring observation within 90 days, like:
person_ID visit date  reoccurrence
1         2/25/2001   1
1         2/30/2001   1
1         4/2/2001    0
2         3/18/2004   0
3         9/22/2004   1
3         10/27/2004  0
3         5/15/2008   0
any help is appreciated, thank you!
Assuming the second 'date' is a typo (2/30/2001 is not a valid date): convert 'visit_date' to Date class, then, grouped by 'person_ID', take the difference in days between the current and the next 'visit_date' (using lead), check whether it is less than 90, and replace the NA at the end of each group with 0.
library(dplyr)
library(lubridate)
library(tidyr)
df1 <- df1 %>%
    mutate(visit_date = mdy(visit_date)) %>%
    group_by(person_ID) %>%
    mutate(reoccurrence = replace_na(+(difftime(lead(visit_date),
                                                visit_date, units = 'days') < 90), 0)) %>%
    ungroup()
-output
# A tibble: 7 x 3
# person_ID visit_date reoccurrence
# <int> <date> <dbl>
#1 1 2001-02-25 1
#2 1 2001-02-28 1
#3 1 2001-04-02 0
#4 2 2004-03-18 0
#5 3 2004-09-22 1
#6 3 2004-10-27 0
#7 3 2008-05-15 0
Or using data.table
library(data.table)
setDT(df1)[, visit_date := as.IDate(visit_date, '%m/%d/%Y')
    ][, reoccurrence := +(difftime(shift(visit_date, type = 'lead'),
                                   visit_date, units = 'days') < 90), by = person_ID
    ][is.na(reoccurrence), reoccurrence := 0]
Or with base R
df1$visit_date <- as.Date(df1$visit_date, '%m/%d/%Y')
with(df1, ave(as.integer(visit_date), person_ID,
    FUN = function(x) c(+(diff(x) < 90), 0)))
#[1] 1 1 0 0 1 0 0
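To keep the result as a column instead of a printed vector, the same call can simply be assigned back:
df1$reoccurrence <- with(df1, ave(as.integer(visit_date), person_ID,
    FUN = function(x) c(+(diff(x) < 90), 0)))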
data
df1 <- structure(list(person_ID = c(1L, 1L, 1L, 2L, 3L, 3L, 3L), visit_date = c("2/25/2001",
"2/28/2001", "4/2/2001", "3/18/2004", "9/22/2004", "10/27/2004",
"5/15/2008")), row.names = c(NA, -7L), class = "data.frame")
Base R variant:
reoccur <- function(x, lim=90) {
  m <- outer(x, x, `-`)             # all pairwise differences in days
  m[upper.tri(m, diag=TRUE)] <- NA  # keep only later-minus-earlier pairs
  colSums(!is.na(m) & m >= 0 & m <= lim) > 0  # any later visit within 'lim' days?
}
### make your dates *dates*
dat$visit <- as.Date(dat$visit, format="%m/%d/%Y")
### calculate if you have reoccurrences
ave(as.numeric(dat$visit), dat$person_ID, FUN=reoccur)
# [1] 1 1 0 0 1 0 0
Data:
dat <- structure(list(person_ID = c(1L, 1L, 1L, 2L, 3L, 3L, 3L), visit = c("2/25/2001", "2/27/2001", "4/2/2001", "3/18/2004", "9/22/2004", "10/27/2004", "5/15/2008")), class = "data.frame", row.names = c(NA, -7L))
(I changed "2/30/2001" to "2/27/2001" to get a real Date out of it.)

Getting column-wise means and standard deviations for positive and negative values separately in R

I can get column-wise means and standard deviations (sample) of a dataframe as follows:
means <- apply(df, 2, mean)
sdevs <- apply(df, 2, sd)
However, my dataframe contains positive and negative values, and I need to get means and standard deviations for positive and negative values separately.
Example Input:
COL1 COL2
   1    1
   2    1
   3    1
  -1   -1
  -5   -1
  -9   -1
Example Output:
positive_means = [2,1]
positive_sdevs = [1,0]
negative_means = [-5,-1]
negative_sdevs = [4,0]
I do not want to build a for loop because my data frame contains too many values and columns.
Thanks.
You can try creating a group for positive and negative values and then summarising with dplyr functions:
library(dplyr)
#Code
new <- df %>%
    mutate(group = ifelse(COL1 > 0 & COL2 > 0, 'Pos', 'Neg')) %>%
    group_by(group) %>%
    summarise_all(list('mean' = mean, 'sd' = sd))
Output:
# A tibble: 2 x 5
group COL1_mean COL2_mean COL1_sd COL2_sd
<chr> <dbl> <dbl> <dbl> <dbl>
1 Neg -5 -1 4 0
2 Pos 2 1 1 0
Some data used:
#Data
df <- structure(list(COL1 = c(1L, 2L, 3L, -1L, -5L, -9L), COL2 = c(1L,
1L, 1L, -1L, -1L, -1L)), class = "data.frame", row.names = c(NA,
-6L))
Another option can be using apply() and rowSums():
#Code1
as.data.frame(apply(df[rowSums(df)>0,],2,function(x) {data.frame(Mean=mean(x),SD=sd(x))}))
Output:
COL1.Mean COL1.SD COL2.Mean COL2.SD
1 2 1 1 0
#Code2
as.data.frame(apply(df[!rowSums(df)>0,],2,function(x) {data.frame(Mean=mean(x),SD=sd(x))}))
Output:
COL1.Mean COL1.SD COL2.Mean COL2.SD
1 -5 4 -1 0
Here's another base R option to add to Duck's excellent answer:
as.data.frame(lapply(df, function(x) c(mean_pos = mean(x[x > 0]),
mean_neg = mean(x[x <= 0]),
sd_pos = sd(x[x > 0 ]),
sd_neg = sd(x[x <= 0]))))
#> COL1 COL2
#> mean_pos 2 1
#> mean_neg -5 -1
#> sd_pos 1 0
#> sd_neg 4 0
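Note that the ifelse(COL1 > 0 & COL2 > 0, ...) grouping above assumes every column in a row shares the same sign. If signs can mix within a row, a long-format sketch with tidyr::pivot_longer() (zeros counted as negative here, matching the x <= 0 convention above) is:
library(dplyr)
library(tidyr)
df %>%
    pivot_longer(everything(), names_to = "column") %>%
    mutate(sign = if_else(value > 0, "positive", "negative")) %>%
    group_by(column, sign) %>%
    summarise(mean = mean(value), sd = sd(value), .groups = "drop")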

How to check previous row value with present row value in dataframe

How can I compare each row's value with the previous row's value, dynamically for every column of a data frame, grouping by a specific ID?
my data frame:
ID ITEM1 ITEM2 ITEM3
1  A     A     A
2  C     B     C
1  A     B     C
1  B     A     C
2  NA    B     F
3  A     A     D
4  R     G     J
4  H     T     J
For example:
ID ITEM1 ITEM2 ITEM3 ITEM1change ITEM2change ITEM3change
1  A     A     A     0           0           0
1  A     B     C     0           1           1
1  B     A     C     1           1           0
2  C     B     C     0           0           0
2  NA    B     F     1           0           1
3  A     A     D     0           0           0
4  R     G     J     0           0           0
4  H     T     J     1           1           0
My final output will be:
Field       modifiedcount unmodifiedcount Total
ITEM1change 3             5               8
ITEM2change 3             5               8
ITEM3change 2             6               8
my data:
structure(list(ID = c(1, 2, 1, 1, 2, 3, 4, 4), ITEM1 = structure(c(1L,
3L, 1L, 2L, NA, 1L, 5L, 4L), .Label = c("A", "B", "C", "H", "R"
), class = "factor"), ITEM2 = structure(c(1L, 2L, 2L, 1L, 2L,
1L, 3L, 4L), .Label = c("A", "B", "G", "T"), class = "factor"),
ITEM3 = structure(c(1L, 2L, 2L, 2L, 4L, 3L, 5L, 5L), .Label = c("A",
"C", "D", "F", "J"), class = "factor")), .Names = c("ID",
"ITEM1", "ITEM2", "ITEM3"), row.names = c(NA, -8L), class = "data.frame")
A possible solution:
library(dplyr)
library(tidyr)
df %>%
    gather(item, value, -1) %>%
    group_by(ID, item) %>%
    mutate(change = lag(value, default = first(value)) != value,
           change = replace(change, is.na(change), TRUE)) %>%
    group_by(item) %>%
    summarise(modified = sum(change, na.rm = TRUE),
              unmodified = sum(!change, na.rm = TRUE)) %>%
    mutate(total = modified + unmodified)
which gives:
# A tibble: 3 x 4
item modified unmodified total
<chr> <int> <int> <int>
1 ITEM1 3 5 8
2 ITEM2 3 5 8
3 ITEM3 2 6 8
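As an aside, gather() is superseded in current tidyr; the same pipeline written with pivot_longer() might look like this (a sketch; values_transform coerces the factor ITEM columns to character so they can share one value column):
library(dplyr)
library(tidyr)
df %>%
    pivot_longer(-ID, names_to = "item", values_to = "value",
                 values_transform = list(value = as.character)) %>%
    group_by(ID, item) %>%
    mutate(change = lag(value, default = first(value)) != value,
           change = replace(change, is.na(change), TRUE)) %>%
    group_by(item) %>%
    summarise(modified = sum(change, na.rm = TRUE),
              unmodified = sum(!change, na.rm = TRUE)) %>%
    mutate(total = modified + unmodified)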
Here is another idea using rollapply from zoo. By using rollapply with width = 2, we test whether each element differs from the one before it. Wrapping it in as.integer turns TRUE/FALSE into 1/0. We then replace all NAs with 1, since you consider them modified, and use colSums to count the modified/unmodified elements. The total is just the number of rows of the original data frame.
library(zoo)
m1 <- do.call(rbind, lapply(split(df, df$ID), function(i)
    sapply(i[-1], function(j)
        as.integer(c(FALSE, rollapply(j, 2, function(k) k[1] != k[2]))))))
m1 <- replace(m1, is.na(m1), 1)
#giving
#  ITEM1 ITEM2 ITEM3
#      0     0     0
#      0     1     1
#      1     1     0
#      0     0     0
#      1     0     1
#      0     0     0
#      0     0     0
#      1     1     0
To get your expected data frame,
final_df <- data.frame(modified = colSums(m1 == 1),
                       unmodified = colSums(m1 != 1),
                       Total = nrow(df), stringsAsFactors = FALSE)
which gives,
modified unmodified Total
ITEM1 3 5 8
ITEM2 3 5 8
ITEM3 2 6 8
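If you also want the per-row ITEM*change indicator columns from the question, here is a dplyr sketch (assuming dplyr >= 1.0 for across(); the change() helper is written here to mirror the NA-counts-as-modified rule):
library(dplyr)
# 1 if the value differs from the previous row within the group
# (NA comparisons count as modified), 0 otherwise
change <- function(x) {
    chg <- x != lag(x, default = first(x))
    +(chg | is.na(chg))
}
df %>%
    group_by(ID) %>%
    mutate(across(starts_with("ITEM"), change, .names = "{.col}change")) %>%
    ungroup()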
The given data has many columns of the same type. This strongly suggests that the data is better stored in long format rather than in wide format.
Jaap's solution reshapes the data using tidyr / dplyr.
However, I would like to suggest a data.table solution which does not reshape the data. In addition, it avoids handling NA values separately.
library(data.table)
# coerce to data.table, loop over columns and determine changes to previous row by ID
tmp <- setDT(DF)[, lapply(.SD, function(x) x == shift(x, fill = x[1])), by = ID]
tmp
ID ITEM1 ITEM2 ITEM3
1: 1 TRUE TRUE TRUE
2: 1 TRUE FALSE FALSE
3: 1 FALSE FALSE TRUE
4: 2 TRUE TRUE TRUE
5: 2 NA TRUE FALSE
6: 3 TRUE TRUE TRUE
7: 4 TRUE TRUE TRUE
8: 4 FALSE FALSE TRUE
Now, we can count the unchanged rows:
tmp[, lapply(.SD, sum, na.rm = TRUE), .SDcols = -"ID"]
ITEM1 ITEM2 ITEM3
1: 5 5 6
From here, OP's expected result can be achieved in two different ways
using melt()
melt(tmp[, lapply(.SD, sum, na.rm = TRUE), .SDcols = -"ID"],
     measure.vars = patterns("^ITEM"),
     variable.name = "item",
     value.name = "unmodified")[
    , c("modified", "Total") := .(nrow(DF) - unmodified, nrow(DF))][]
or by transposing:
as.data.table(
t(tmp[, lapply(.SD, sum, na.rm = TRUE), .SDcols = -"ID"])
, keep.rownames = "item")[, setnames(.SD, "V1", "unmodified")][
, c("modified", "Total") := .(nrow(DF) - unmodified, nrow(DF))][]
Both return the same result:
item unmodified modified Total
1: ITEM1 5 3 8
2: ITEM2 5 3 8
3: ITEM3 6 2 8
For the sake of completeness, here is also a data.table implementation of the reshape approach. As above, NA are handled by counting the unmodified rows first excluding any NA.
melt(setDT(DF), id.vars = "ID", variable.name = "item")[
, value == shift(value, fill = value[1L]), by = .(ID, item)][
, .(unmodified = sum(V1, na.rm = TRUE)), by = item][
, c("modified", "Total") := .(nrow(DF) - unmodified, nrow(DF))][]
If dat is your data, then try:
Create ITEMCHANGE variables
dat["ITEM1Change"] <- c(NA, head(dat["ITEM1"], dim(dat)[1] - 1)[[1]])
dat["ITEM2Change"] <- c(NA, head(dat["ITEM2"], dim(dat)[1] - 1)[[1]])
dat["ITEM3Change"] <- c(NA, head(dat["ITEM3"], dim(dat)[1] - 1)[[1]])
Then compare if there are changes
dat$ITEM1Change <- ifelse(dat$ITEM1Change == dat$ITEM1, 0, 1)
dat$ITEM2Change <- ifelse(dat$ITEM2Change == dat$ITEM2, 0, 1)
dat$ITEM3Change <- ifelse(dat$ITEM3Change == dat$ITEM3, 0, 1)
Then group and summarize
library(dplyr)
dat %>%
    group_by(ITEM1) %>%
    summarise_at(.funs = sum, .vars = "ITEM1Change") -> ITEM1Change
etc.
Is this what you need?

A clean way to aggregate/groupby involving only the distinct values in each column?

I've been using the dplyr package to create aggregated data tables, for example using the following code:
agg_data <- df %>%
    select(calc.method, price1, price2) %>%
    group_by(calc.method) %>%
    summarize(
        count = n(),
        mean_price1 = round(mean(price1, na.rm = TRUE), 2),
        mean_price2 = round(mean(price2, na.rm = TRUE), 2))
However, I would like to calculate the means over only the distinct values of price1 and price2 within groups, e.g.:
Price1: 1 1 2 1 2 2 1
goes to (before aggregation):
Price1: 1 2 1 2 1
(and in general price1 and price2 won't be left with the same number of values after removal). I would also like a count for each of price1 and price2, counting only distinct values within groups. (A "group" here means a run of two or more identical adjacent values.)
I have tried:
agg_data <- df %>%
    select(calc.method, price1, price2) %>%
    group_by(calc.method) %>%
    summarize(
        count = n(),
        mean_price1 = round(mean(distinct(price1), na.rm = TRUE), 2),
        mean_price2 = round(mean(distinct(price2), na.rm = TRUE), 2))
And also tried wrapping the columns within the select function with distinct(), but both these throw errors.
Is there a way to do this using dplyr or another similar package without having to write something from scratch?
To satisfy your requirement for distinct, we need to remove successive values that are the same. For numeric vectors, this can be accomplished by:
x <- x[c(1, which(diff(x) != 0)+1)]
The default use of diff computes the difference between adjoining elements in the vector. We use this to detect successive values that are different, for which diff(x) != 0. Since the output differences are lagged by 1, we add 1 to the indices of these distinct elements, and we also want the first element as distinct. For example:
x <- c(1,1,2,1,2,2,1)
x <- x[c(1, which(diff(x) != 0)+1)]
##[1] 1 2 1 2 1
We can then use this with dplyr:
agg_data <- df %>% group_by(calc.method) %>%
    summarize(count = n(),
              count_non_rep_1 = length(price1[c(1, which(diff(price1) != 0) + 1)]),
              mean_price1 = round(mean(price1[c(1, which(diff(price1) != 0) + 1)], na.rm = TRUE), 2),
              count_non_rep_2 = length(price2[c(1, which(diff(price2) != 0) + 1)]),
              mean_price2 = round(mean(price2[c(1, which(diff(price2) != 0) + 1)], na.rm = TRUE), 2))
or, better yet, define the function:
remove.repeats <- function(x) {
    x[c(1, which(diff(x) != 0) + 1)]
}
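As an aside, base R's rle() collapses runs of repeated values, so an equivalent definition is simply the following (with the caveat that rle() never merges runs of NAs, so NA handling differs slightly):
remove.repeats <- function(x) {
    rle(x)$values
}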
and use it with dplyr:
agg_data <- df %>% group_by(calc.method) %>%
    summarize(count = n(),
              count_non_rep_1 = length(remove.repeats(price1)),
              mean_price1 = round(mean(remove.repeats(price1), na.rm = TRUE), 2),
              count_non_rep_2 = length(remove.repeats(price2)),
              mean_price2 = round(mean(remove.repeats(price2), na.rm = TRUE), 2))
Using this on some example data that is hopefully similar to yours:
df <- structure(list(calc.method = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"),
price1 = c(1, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 3),
price2 = c(1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1)),
.Names = c("calc.method", "price1", "price2"), row.names = c(NA, -15L), class = "data.frame")
## calc.method price1 price2
##1 A 1 1
##2 A 1 1
##3 A 2 1
##4 A 1 1
##5 A 2 1
##6 A 2 1
##7 A 1 1
##8 B 1 2
##9 B 1 1
##10 B 2 2
##11 B 2 1
##12 B 2 2
##13 B 2 1
##14 B 1 2
##15 B 3 1
We get:
print(agg_data)
## A tibble: 2 x 6
## calc.method count count_non_rep_1 mean_price1 count_non_rep_2 mean_price2
## <fctr> <int> <int> <dbl> <int> <dbl>
##1 A 7 5 1.40 1 1.0
##2 B 8 4 1.75 8 1.5
