Cumulative sum under some conditions in R

I would like to compute a cumulative sum of the variable "nbre_lignes", with the result stored in a variable named cumulative_sum. I managed to do it, but my approach is not automated.
Can someone help me automate it?
library(FSA)        # for pcumsum()
library(dplyr)
library(zoo)        # for as.yearqtr()
library(lubridate)  # so months() on a number returns a period
strwr <- function(str) gsub(" ", "\n", str)  # wrap multi-word labels over several lines
waterfall <- data.frame(
  table = strwr(c("Concaténation", "Doublons & NPI", "DGC & CAR-REU", "BDD",
                  paste("Répondants", format(as.yearqtr(Sys.Date() - months(12)), "T%q"),
                        "&", format(as.yearqtr(Sys.Date() - months(9)), "T%q-%y")),
                  paste("Sollicités au ", format(as.yearqtr(Sys.Date() - months(3)), "T%q-%y")),
                  "Exclusions ", "QD", "Cible Finale")),
  nbre_lignes = c(638334, -362769, -17674, 41927, -1540, -20149, -300, -10, 19928)
)
waterfall$time <- 1:nrow(waterfall)
waterfall$flow <- factor(sign(waterfall$nbre_lignes))
waterfall$table <- factor(waterfall$table, levels = waterfall[["table"]])
# Manual version: partial cumulative sums per block, then a trailing 0.
# This is the part I would like to automate.
b <- pcumsum(waterfall$nbre_lignes[1:3])
l <- pcumsum(waterfall$nbre_lignes[4:8])
cumulative_sum <- c(b, l, 0)
waterfall <- waterfall %>% cbind(cumulative_sum)
table nbre_lignes time flow cumulative_sum
1 Concaténation 638334 1 1 0
2 Doublons\n&\nNPI -362769 2 -1 638334
3 DGC\n&\nCAR-REU -17674 3 -1 275565
4 BDD 41927 4 1 0
5 Répondants\nT3\n&\nT4-18 -1540 5 -1 41927
6 Sollicités\nau\n\nT2-19 -20149 6 -1 40387
7 Exclusions\n -300 7 -1 20238
8 QD -10 8 -1 19938
9 Cible\nFinale 19928 9 1 0

We can form a grouping variable with cumsum(flow == 1), so that a new group starts at each positive flow, and then take a lagged cumulative sum within each group:
waterfall %>%
  group_by(grp = cumsum(flow == 1)) %>%
  mutate(cumsum = lag(cumsum(nbre_lignes), default = 0)) %>%
  ungroup %>%
  select(-grp)
giving:
# A tibble: 9 x 5
table nbre_lignes time flow cumsum
<fct> <dbl> <int> <fct> <dbl>
1 Concaténation 638334 1 1 0
2 "Doublons\n&\nNPI" -362769 2 -1 638334
3 "DGC\n&\nCAR-REU" -17674 3 -1 275565
4 BDD 41927 4 1 0
5 "Répondants\nT3\n&\nT4-18" -1540 5 -1 41927
6 "Sollicités\nau\n\nT2-19" -20149 6 -1 40387
7 "Exclusions\n" -300 7 -1 20238
8 QD -10 8 -1 19938
9 "Cible\nFinale" 19928 9 1 0

Related

Retaining values from one row to the next

I have a data frame with one variable, x. I want to create a new variable y which is equal to 1 when x decreases by 2 from its previous value and equal to 0 otherwise. Then I want to create a variable z which holds the value of x when y was last equal to 1. I want the initial value of z to be 0. I haven't been able to figure out how to make z. Any advice?
Here's what I'm trying to obtain (but for about 1000 rows):
x y z
9 0 0
8 0 0
6 1 6
9 0 6
7 1 7
5 1 5
I've tried lag() and the cumulative functions in dplyr, to no avail.
library(dplyr)
library(tidyr)
df <- data.frame(x = c(9, 8, 6, 10, 9, 7, 5))
df %>%
  mutate(y = +(lag(x, default = x[1]) - x == 2),             # +() turns the logical into 0/1
         z = ifelse(cumsum(y) > 0 & y == 0, NA, x * y)) %>%  # NA marks "carry the last value forward"
  fill(z, .direction = "down")
#> x y z
#> 1 9 0 0
#> 2 8 0 0
#> 3 6 1 6
#> 4 10 0 6
#> 5 9 0 6
#> 6 7 1 7
#> 7 5 1 5
Created on 2022-11-07 by the reprex package (v2.0.1)
One option:
df$y = 0L
df$y[-1] = (diff(df$x) == -2L)  # TRUE (coerced to 1L) where x dropped by exactly 2
# carry the last x where y == 1 forward; rows before the first match get 0
df$z = data.table::nafill(ifelse(df$y == 1L, df$x, NA), "locf", fill = 0L)
# x y z
# 1 9 0 0
# 2 8 0 0
# 3 6 1 6
# 4 9 0 6
# 5 7 1 7
# 6 5 1 5
Reproducible data (please provide next time)
df = data.frame(x = c(9L,8L,6L,9L,7L,5L))
Here's a simple way to do it using dplyr and tidyr:
library(dplyr)
library(tidyr)  # for fill()
tmp = data.frame(x = c(9, 8, 6, 9, 7, 5))
tmp %>%
  mutate(y = ifelse(lag(x) - x == 2, 1, 0)) %>%
  mutate(z = ifelse(y == 1, x, NA)) %>%  # NA where z should carry the last value forward
  fill(z, .direction = "down") %>%
  replace(is.na(.), 0)                   # leading NAs (before any match) become 0
# output
# x y z
# 1 9 0 0
# 2 8 0 0
# 3 6 1 6
# 4 9 0 6
# 5 7 1 7
# 6 5 1 5
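For reference, the same carry-forward can be done with a cummax() indexing trick, with no fill() or nafill() at all. A minimal base R sketch, assuming the six-row data frame tmp from the answer above:
y <- c(0L, as.integer(diff(tmp$x) == -2))
idx <- cummax(seq_along(y) * y)  # row index of the most recent y == 1, or 0 if none yet
z <- c(0, tmp$x)[idx + 1]        # prepending 0 makes idx == 0 map to the initial value of 0
z
#> [1] 0 0 6 6 7 5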

Get number of ties ego and alter have in common in R

I have a directed network dataset of adolescent friendships. I'd like to make an edgelist that includes the number of friends ego has in common with alter (someone ego and alter both nominated as a friend). Below is some sample data:
HAVE DATA:
id alter
1 3
1 5
1 9
2 3
2 5
3 2
3 5
3 9
3 6
WANT DATA:
id alter num_common
1 3 2
1 5 0
1 9 0
2 3 1
2 5 0
3 2 1
3 5 0
3 9 0
3 6 0
A solution could be to transform the edgelist into an adjacency matrix (using the igraph package) and multiply it by its transpose to count the number of shared neighbors:
library(igraph)
el <- read.table(text = " id alter
1 3
1 5
1 9
2 3
2 5
3 2
3 5
3 9
3 6", header = TRUE)
g <- graph_from_edgelist(as.matrix(el), directed = TRUE)
m <- get.adjacency(g, sparse = FALSE)
m2 <- m %*% t(m)  # entry (i, j): number of common out-neighbors of i and j
Afterwards, transform the resulting matrix back into an edgelist and merge it with the original data set:
el2 <- reshape2::melt(m2)
dplyr::left_join(el, el2, by = c("id" = "Var1", "alter" = "Var2"))
id alter value
1 1 3 2
2 1 5 0
3 1 9 0
4 2 3 1
5 2 5 0
6 3 2 1
7 3 5 0
8 3 9 0
9 3 6 0
To see how often ego and alter were both nominated by the same friend, change the direction of the relation by using t(m) %*% m instead of m %*% t(m). To ignore direction altogether, set the directed argument to FALSE in graph_from_edgelist.
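For illustration, the variants side by side; a short sketch reusing m and el from above:
# common out-neighbors: friends that both ego and alter nominated
m_out <- m %*% t(m)
# common in-neighbors: people who nominated both ego and alter
m_in <- t(m) %*% m
# undirected version: every nomination counts as a mutual tie
g_u <- graph_from_edgelist(as.matrix(el), directed = FALSE)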
This is a possible, though not very simple, solution:
# your dummy data
df <- data.table::fread("id alter
1 3
1 5
1 9
2 3
2 5
3 2
3 5
3 9
3 6")
library(dplyr)
library(tidyr)
# all pairs vertically with pair ID
pairs_v <- combn(unique(c(df$id, df$alter)), 2) %>%
  dplyr::as_tibble() %>%
  tidyr::pivot_longer(cols = everything()) %>%
  dplyr::arrange(name)
# number of common friends per pair ID
pairs_comp <- pairs_v %>%
  dplyr::left_join(df, by = c("value" = "id")) %>%
  dplyr::count(name, alter) %>%
  dplyr::filter(n > 1 & !is.na(alter)) %>%
  dplyr::count(name)
# all pairs horizontally with pair ID
pairs_h <- pairs_v %>%
  dplyr::group_by(name) %>%
  dplyr::mutate(G_ID = dplyr::row_number()) %>%
  tidyr::pivot_wider(names_from = G_ID, values_from = "value")
# multiple left joins to pick up the common-friend count for each direction of the pair
df %>%
  dplyr::left_join(pairs_h, by = c("id" = "1", "alter" = "2")) %>%
  dplyr::left_join(pairs_comp) %>%
  dplyr::left_join(pairs_h, by = c("id" = "2", "alter" = "1")) %>%
  dplyr::left_join(pairs_comp, by = c("name.y" = "name")) %>%
  dplyr::mutate(num_common = case_when(!is.na(n.x) ~ as.numeric(n.x),
                                       !is.na(n.y) ~ as.numeric(n.y),
                                       TRUE ~ 0)) %>%
  dplyr::select(id, alter, num_common)
id alter num_common
1: 1 3 2
2: 1 5 0
3: 1 9 0
4: 2 3 1
5: 2 5 0
6: 3 2 1
7: 3 5 0
8: 3 9 0
9: 3 6 0

Transform count table into disaggregated table of observations

I have data in the form of a count table of successes and trials, but for modeling I need these data in a disaggregated trial-level table.
How do I get from this:
dplyr::tibble(
user_id = c(1,2),
success = c(3,4),
trials = c(9, 10)
)
To this:
dplyr::tibble(
user_id = c(rep(1, 9), rep(2, 10)),
success = c(rep(1, 3),rep(0, 6), rep(1, 4), rep(0, 6))
)
We can uncount based on 'trials'; then, grouped by 'user_id', convert 'success' to binary with a logical condition on row_number:
library(dplyr)
library(tidyr)
# assuming the count table shown above is stored as df1
df1 <- tibble(user_id = c(1, 2), success = c(3, 4), trials = c(9, 10))
df1 %>%
  uncount(trials) %>%                                      # one row per trial
  group_by(user_id) %>%
  mutate(success = +(row_number() <= first(success))) %>%  # first 'success' rows per user get 1
  ungroup
# A tibble: 19 x 2
# user_id success
# <dbl> <int>
# 1 1 1
# 2 1 1
# 3 1 1
# 4 1 0
# 5 1 0
# 6 1 0
# 7 1 0
# 8 1 0
# 9 1 0
#10 2 1
#11 2 1
#12 2 1
#13 2 1
#14 2 0
#15 2 0
#16 2 0
#17 2 0
#18 2 0
#19 2 0
Or with base R, using Map and stack:
# one 1 per success and one 0 per failure for each user, named by user_id
stack(setNames(Map(function(x, y) rep(1:0, c(x, y)),
                   df1$success, df1$trials - df1$success),
               df1$user_id))[2:1]
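As a quick sanity check (a sketch storing the tidyverse result in a hypothetical out), the per-user sums should reproduce the original counts:
out <- df1 %>%
  uncount(trials) %>%
  group_by(user_id) %>%
  mutate(success = +(row_number() <= first(success))) %>%
  ungroup()
tapply(out$success, out$user_id, sum)
#> 1 2 
#> 3 4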

Computing minimum distance between a row and all previous rows in R

I want to compute the minimum distance between the current row and every row before it within each group. My data frame has several groups, and each group has multiple dates with longitude and latitude. I use a Haversine function to compute distance, and I need to apply this function as described above. The data frame looks like the following:
grp date long lat rowid
1 1 1995-07-01 11 12 1
2 1 1995-07-05 3 0 2
3 1 1995-07-09 13 4 3
4 1 1995-07-13 4 25 4
5 2 1995-03-07 12 6 1
6 2 1995-03-10 3 27 2
7 2 1995-03-13 34 8 3
8 2 1995-03-16 25 9 4
My current attempt uses purrrlyr::by_row, but the method is too slow. In practice, each group has thousands of dates and geographic positions. Here is part of my current attempt:
calc_min_distance <- function(df, grp.name, row) {
  df %>%
    filter(grp_name == grp.name) %>%
    filter(row_number() <= row) %>%
    mutate(last.lat = last(lat),
           last.long = last(long),
           rowid = 1:n()) %>%
    group_by(rowid) %>%
    purrrlyr::by_row(
      ~haversinedistance.fnct(.$last.long, .$last.lat, .$long, .$lat),
      .collate = 'rows',
      .to = 'min.distance'
    ) %>%
    filter(row_number() < n()) %>%
    summarise(min = min(min.distance)) %>%
    .$min
}
df_dist <- df %>%
  group_by(grp_name) %>%
  mutate(rowid = 1:n()) %>%
  group_by(grp_name, rowid) %>%
  purrrlyr::by_row(
    ~calc_min_distance(df, .$grp_name, .$rowid),
    .collate = 'rows',
    .to = 'min.distance'
  ) %>%
  ungroup %>%
  select(-rowid)
For a simple illustration, suppose distance is defined as (lat + long) of the reference row minus (lat + long) of each row before it. My expected output for grp 1 is then the following:
grp date long lat rowid min.distance
1 1 1995-07-01 11 12 1 0
2 1 1995-07-05 3 0 2 -20
3 1 1995-07-09 13 4 3 -6
4 1 1995-07-13 4 25 4 6
How can I quickly compute the minimum distance between the current rowid and all rowids before it?
Here's how I would go about it. You need to calculate all the within-group pair-wise distances anyway, so we'll use geosphere::distm, which is designed to do just that. I'd suggest stepping through my function line by line and looking at what it does; I think it will make sense.
library(geosphere)
library(dplyr)
find_min_dist_above = function(long, lat, fun = distHaversine) {
  d = distm(x = cbind(long, lat), fun = fun)  # all pair-wise distances within the group
  d[lower.tri(d, diag = TRUE)] = NA           # keep only distances to earlier rows
  d[1, 1] = 0                                 # the first row has no earlier rows
  return(apply(d, MAR = 2, min, na.rm = TRUE))
}
df %>%
  group_by(grp) %>%
  mutate(min.distance = find_min_dist_above(long, lat))
# # A tibble: 8 x 6
# # Groups: grp [2]
# grp date long lat rowid min.distance
# <int> <fct> <int> <int> <int> <dbl>
# 1 1 1995-07-01 11 12 1 0
# 2 1 1995-07-05 3 0 2 1601842.
# 3 1 1995-07-09 13 4 3 917395.
# 4 1 1995-07-13 4 25 4 1623922.
# 5 2 1995-03-07 12 6 1 0
# 6 2 1995-03-10 3 27 2 2524759.
# 7 2 1995-03-13 34 8 3 2440596.
# 8 2 1995-03-16 25 9 4 997069.
Using this data:
df = read.table(text = ' grp date long lat rowid
1 1 1995-07-01 11 12 1
2 1 1995-07-05 3 0 2
3 1 1995-07-09 13 4 3
4 1 1995-07-13 4 25 4
5 2 1995-03-07 12 6 1
6 2 1995-03-10 3 27 2
7 2 1995-03-13 34 8 3
8 2 1995-03-16 25 9 4', h = TRUE)
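As an aside, the toy (lat + long) metric defined in the question reduces to a running maximum: the minimum of s[i] - s[j] over earlier rows j is just s[i] minus the largest earlier s. A minimal dplyr sketch, assuming df as above:
library(dplyr)
df %>%
  group_by(grp) %>%
  mutate(s = long + lat,
         min.distance = ifelse(row_number() == 1, 0,
                               s - cummax(lag(s, default = -Inf)))) %>%
  select(-s)
# grp 1 yields 0, -20, -6, 6, matching the expected output in the question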

How to remove zero values until the first non-zero value occurs in an R dataframe?

The title says it all! I have grouped data from which I'd like to remove, within each id group, all rows before the first non-zero value.
Example code:
problem <- data.frame(
id = c(1,1,1,1,2,2,2,2,3,3,3,3),
value = c(0,0,2,0,0,8,4,2,1,7,6,5)
)
solution <- data.frame(
id = c(1,1,2,2,2,3,3,3,3),
value = c(2,0,8,4,2,1,7,6,5)
)
Here is a dplyr solution:
library(dplyr)
problem %>%
  group_by(id) %>%
  mutate(first_match = min(row_number()[value != 0])) %>%  # index of the first non-zero row per id
  filter(row_number() >= first_match) %>%
  select(-first_match) %>%
  ungroup()
# A tibble: 9 x 2
id value
<dbl> <dbl>
1 1 2
2 1 0
3 2 8
4 2 4
5 2 2
6 3 1
7 3 7
8 3 6
9 3 5
Or more succinctly per Tjebo's comment:
problem %>%
  group_by(id) %>%
  filter(row_number() >= min(row_number()[value != 0])) %>%
  ungroup()
You can do this in base R:
subset(problem, ave(value, id, FUN = cumsum) > 0)
# id value
# 3 1 2
# 4 1 0
# 6 2 8
# 7 2 4
# 8 2 2
# 9 3 1
# 10 3 7
# 11 3 6
# 12 3 5
Use abs(value) if you have negative values in your real case.
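A variant that sidesteps the sign issue entirely counts non-zero values instead of summing them; a short sketch using the problem data from above:
subset(problem, ave(value, id, FUN = function(v) cumsum(v != 0)) > 0)
# or the dplyr analogue:
problem %>%
  group_by(id) %>%
  filter(cumsum(value != 0) > 0) %>%
  ungroup()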
