Generate a random binary variable conditionally in R

I would like to add an extra column, z, based on the following conditions:
if x == "A", generate a binary variable assuming the probability of success (z = 1) is 0.5
if x == "C" & y == "N", generate a binary variable assuming the probability of success is 0.25
otherwise, z should be NA
# Sample data
library(tibble)
df <- tibble(
  x = c("A", "C", "C", "B", "C", "A", "A"),
  y = c("Y", "N", "Y", "N", "N", "N", "Y")
)
Currently, my approach uses filter, then set.seed and rbinom, and finally rbind. But I am looking for a more elegant solution that doesn't involve subsetting and re-joining the data.
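For reference, the subset-and-rejoin pattern described above looks roughly like this (a sketch of what the question wants to avoid):
library(dplyr)
set.seed(1)
# Draw z separately under each condition, then stack the pieces back together
a    <- df %>% filter(x == "A") %>% mutate(z = rbinom(n(), 1, 0.5))
cn   <- df %>% filter(x == "C", y == "N") %>% mutate(z = rbinom(n(), 1, 0.25))
rest <- df %>% filter(!(x == "A" | (x == "C" & y == "N"))) %>% mutate(z = NA_integer_)
bind_rows(a, cn, rest)  # note: row order differs from the original df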

This is a good case for dplyr::case_when since you are using tidyverse functions.
library(dplyr)
set.seed(1)
df %>%
  mutate(z = case_when(x == "A"            ~ rbinom(n(), 1, 0.5),
                       x == "C" & y == "N" ~ rbinom(n(), 1, 0.25)))
# A tibble: 7 x 3
x y z
<chr> <chr> <int>
1 A Y 0
2 C N 1
3 C Y NA
4 B N NA
5 C N 0
6 A N 0
7 A Y 1
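If you prefer the NA for unmatched rows to be explicit, you can add a default branch. It must be NA_integer_, since case_when requires all branches to share one type and rbinom returns integers:
df %>%
  mutate(z = case_when(x == "A"            ~ rbinom(n(), 1, 0.5),
                       x == "C" & y == "N" ~ rbinom(n(), 1, 0.25),
                       TRUE                ~ NA_integer_))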

You may put your logic into a simple if / else structure and wrap it in a function g().
g <- \(z) {
  if (z['x'] == 'A') {
    rbinom(1, 1, .5)
  } else if (z['x'] == 'C' && z['y'] == 'N') {
    rbinom(1, 1, .25)
  } else {
    NA
  }
}
set.seed(42)
transform(df, z = apply(df, 1, g))
# x y z
# 1 A Y 1
# 2 C N 1
# 3 C Y NA
# 4 B N NA
# 5 C N 0
# 6 A N 1
# 7 A Y 1
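One caveat: apply(df, 1, g) coerces each row to a character vector, which is harmless here (both columns are already character) but can surprise with mixed column types. A mapply variant (a sketch) avoids the row coercion:
set.seed(42)
transform(df, z = mapply(\(x, y) if (x == 'A') rbinom(1, 1, .5)
                         else if (x == 'C' && y == 'N') rbinom(1, 1, .25)
                         else NA_integer_,
                         x, y))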

You can try a nested ifelse like below:
transform(
  df,
  z = suppressWarnings(
    rbinom(
      nrow(df), 1,
      ifelse(x == "A", 0.5,
             ifelse(x == "C" & y == "N", 0.25, NA))
    )
  )
)
which gives
x y z
1 A Y 1
2 C N 0
3 C Y NA
4 B N NA
5 C N 1
6 A N 1
7 A Y 1
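The suppressWarnings wrapper is needed because rbinom warns and returns NA whenever prob is NA:
rbinom(1, 1, NA)
# [1] NA
# Warning message:
# In rbinom(1, 1, NA) : NAs produced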

Related

R: how to identify duplicate rows by all (multiple columns) and/or any (multiple columns)?

I want to identify duplicate rows in a data frame based on two types of conditions:
1: all (multiple columns): all the elements in the given columns must be the same.
2: any (multiple columns): at least one of the elements in the given columns must be the same.
3: both 1 and 2 must hold.
getRepplicate <- function(df, allCol = "", anyCol = "") {
  # rows satisfying both condition 1 and condition 2 are considered replicated rows
}
For example:
df <- data.frame(
  a = c(1, 1, 2, 3, 4, 1, 1, 3),
  b = c(1, 2, 2, 3, 4, 1, 1, 3),
  d = c("x", "y", "z", "x", "x", "y", "x", "x"),
  e = c("x", "y", "z", "x", "x", "x", "z", "x")
)
> df
a b d e
1 1 1 x x
2 1 2 y y
3 2 2 z z
4 3 3 x x
5 4 4 x x
6 1 1 y x
7 1 1 x z
8 3 3 x x
If I apply this function df2 <- getRepplicate(df, allCol = c("a", "b"), anyCol = c("d", "e")), my expected result will be:
> df2
a b d e isReplicate
1 1 1 x x TRUE
2 1 2 y y FALSE
3 2 2 z z FALSE
4 3 3 x x TRUE
5 4 4 x x FALSE
6 1 1 y x TRUE
7 1 1 x z TRUE
8 3 3 x x TRUE
Thanks for your help.
Perhaps this:
library(dplyr)
df %>%
  mutate(
    dupall = duplicated(.) | duplicated(., fromLast = TRUE),
    dup3 = rowSums(sapply(., function(z) duplicated(z) | duplicated(z, fromLast = TRUE))) > 3
  )
# a b d e dupall dup3
# 1 1 1 x x FALSE TRUE
# 2 1 2 y y FALSE FALSE
# 3 2 2 z z FALSE FALSE
# 4 3 3 x x TRUE TRUE
# 5 4 4 x x FALSE FALSE
# 6 1 1 y x FALSE TRUE
# 7 1 1 x z FALSE TRUE
# 8 3 3 x x TRUE TRUE
The dup3 column is TRUE where more than 3 of the 4 columns contain duplicated values, i.e. where all four do. It doesn't indicate which of the columns are dupes.
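To match the asked-for interface directly, here is a compact sketch of one reading of the requested semantics (the helper dup_both is mine): condition 1 checks the allCol columns jointly, condition 2 checks each anyCol column on its own. On the sample data this reproduces the expected isReplicate column.
dup_both <- function(x) duplicated(x) | duplicated(x, fromLast = TRUE)
getRepplicate <- function(df, allCol, anyCol) {
  all_ok <- dup_both(df[allCol])                       # all of allCol jointly duplicated
  any_ok <- Reduce(`|`, lapply(df[anyCol], dup_both))  # at least one anyCol duplicated
  df$isReplicate <- all_ok & any_ok
  df
}
getRepplicate(df, allCol = c("a", "b"), anyCol = c("d", "e"))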
A slightly tongue-in-cheek approach with group_by:
library(dplyr)
df %>%
group_by(a) %>% mutate(o = n() > 1) %>%
group_by(b) %>% mutate(p = n() > 1) %>%
group_by(d) %>% mutate(q = n() > 1) %>%
group_by(e) %>% mutate(r = n() > 1) %>%
rowwise() %>%
mutate(isReplicate = all(across(o:r))) %>%
ungroup() %>%
select(-(o:r))
# A tibble: 8 × 5
a b d e isReplicate
<dbl> <dbl> <chr> <chr> <lgl>
1 1 1 x x TRUE
2 1 2 y y FALSE
3 2 2 z z FALSE
4 3 3 x x TRUE
5 4 4 x x FALSE
6 1 1 y x TRUE
7 1 1 x z TRUE
8 3 3 x x TRUE
# Define function
checkRep <- function(df, col_all, col_any) {
  # (1) check input
  if (is.null(col_all) && is.null(col_any)) stop("At least one column should be selected.")
  # (2) check condition 1: duplicated across all of col_all jointly
  if (is.null(col_all)) {
    df$allReplicate <- TRUE
  } else {
    df <- df %>%
      group_by(across(all_of(col_all))) %>%
      mutate(allReplicate = n() > 1) %>%
      ungroup()
  }
  # (3) check condition 2: duplicated in at least one of col_any
  if (is.null(col_any)) {
    Final <- cbind.data.frame(df, anyReplicate = TRUE)
  } else {
    n <- length(col_any)
    tem <- data.frame(matrix(ncol = n, nrow = nrow(df)))
    for (i in 1:n) {
      tem[, i] <- df %>%
        group_by(.data[[col_any[i]]]) %>%
        mutate(x = n() > 1) %>%
        ungroup() %>%
        pull(x)
    }
    Final <- cbind.data.frame(df, anyReplicate = rowSums(tem) > 0)
  }
  # (4) both condition 1 and condition 2 must hold
  Final <- Final %>%
    mutate(isReplicate = allReplicate & anyReplicate) %>%
    select(-c(allReplicate, anyReplicate))
  return(Final)
}
# Check the result
df <- data.frame(
  a = c(1, 1, 2, 3, 4, 1, 1, 3),
  b = c(1, 2, 2, 3, 4, 1, 1, 3),
  d = c("x", "y", "z", "x", "x", "y", "x", "x"),
  e = c("x", "y", "z", "x", "x", "x", "z", "x")
)
col_all <- c("a", "b")
col_any <- c("d", "e")
checkRep(df, col_all, col_any)

Estimating the percentage of common set elements over combined past periods in a panel

I have a time-series panel dataset that is structured in the following way: There are 2 funds that each own different stocks at each time period.
df <- data.frame(
  fund_id  = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2),
  time_Q   = c(1, 1, 1, 2, 2, 2, 2, 3, 3, 1, 1, 2, 2),
  stock_id = c("A", "B", "C", "A", "C", "D", "E", "D", "E", "A", "B", "B", "C")
)
> df
fund_id time_Q stock_id
1 1 1 A
2 1 1 B
3 1 1 C
4 1 2 A
5 1 2 C
6 1 2 D
7 1 2 E
8 1 3 D
9 1 3 E
10 2 1 A
11 2 1 B
12 2 2 B
13 2 2 C
For each fund, I would like to calculate the percentage of stocks held in the current time_Q that were also held at any point during the previous one or two quarters. So for every fund and every time_Q, I would like two columns, past_1Q and past_1_2Q, showing what percentage of the stocks held now were also present in any of those past quarters.
Here is what the result should look like:
result <- data.frame(
  fund_id   = c(1, 1, 1, 2, 2),
  time_Q    = c(1, 2, 3, 1, 2),
  past_1Q   = c(NA, 0.5, 1, NA, 0.5),
  past_1_2Q = c(NA, 0.5, 1, NA, 0.5)
)
> result
fund_id time_Q past_1Q past_1_2Q
1 1 1 NA NA
2 1 2 0.5 0.5
3 1 3 1 1
4 2 1 NA NA
5 2 2 0.5 0.5
I already asked a similar question here, but now I'm looking for common elements across any of the past lagged periods. I'm looking for a scalable dplyr or data.table solution that can handle around 12 past quarters and many funds, stocks, and time periods.
Thanks in advance!
My solution:
# dummy data
df <- data.table(fund_id = c(1,1,1,1,1,1,1,1, 1, 2,2,2,2)
, time_Q = c(1,1,1,2,2,2,2,3, 3, 1,1,2,2)
, stock_id = c("A", "B", "C", "A", "C", "D", "E", "D", "E", "A", "B", "B", "C")
); df
# lower case col names
names(df) <- tolower(names(df))
# unique grouping
x <- df[, .(dummy =.N), .(fund_id, time_q)][, dummy := NULL]
# initialise empty table
y <- NULL
# loop
for(i in 1:nrow(x))
{
# current quarter & before
z <- df[fund_id == x[i, fund_id]
& time_q %between% c( x[i, time_q] - 12, x[i, time_q])
]
# current quarter
a <- z[fund_id == x[i, fund_id]
& time_q == x[i, time_q]
, unique(stock_id)
]
# minus 1 to minus 12 quarter (lapply)
b <- lapply(1:12, \(j) z[fund_id == x[i, fund_id]
& time_q %between% c( x[i, time_q] - j, x[i, time_q] - 1)
, unique(stock_id)
]
)
# results
c <- data.table(fund_id = x[i, fund_id]
, current_q = x[i, time_q]
)
# no. of stocks in current quarter
d <- length(a)
# calculate % for the 12 periods
c[, paste0('past_1_to_', 1:12, '_q') := lapply(1:12, \(j) length(intersect(a,b[[j]])) / d) ]
# collect results
y <- rbind(y, c)
}
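One scalability note: growing y with rbind() inside the loop copies the accumulated table on every iteration. The usual data.table idiom is to collect per-iteration results in a pre-allocated list and bind once at the end, roughly like this (a sketch; the per-group result c is built exactly as in the loop above):
res <- vector("list", nrow(x))
for (i in 1:nrow(x)) {
  # ... build the per-group result 'c' as in the loop above ...
  res[[i]] <- c
}
y <- rbindlist(res)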
benchmark
x <- 1e3
df <- data.table(fund_id  = rep(1:x, each = x / 10),
                 time_Q   = rep(1:4, each = x / 4),
                 stock_id = sample(letters, size = 20, replace = TRUE))
This took about 20 seconds on the above df, with 100k rows and 1,200 (fund_id, time_q) groups.
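For comparison, the same calculation can be sketched in dplyr (the helper name overlap_past is mine; it assumes integer quarters and uses the question's original df). For each fund and quarter it collects the stock set, then computes the share of current stocks seen in any of the previous k quarters:
library(dplyr)
overlap_past <- function(df, k) {
  df %>%
    group_by(fund_id, time_Q) %>%
    summarise(stocks = list(unique(stock_id)), .groups = "drop_last") %>%
    arrange(time_Q, .by_group = TRUE) %>%
    mutate(past_share = vapply(seq_along(stocks), function(i) {
      past <- unlist(stocks[time_Q < time_Q[i] & time_Q >= time_Q[i] - k])
      if (length(past) == 0) NA_real_ else mean(stocks[[i]] %in% past)
    }, numeric(1))) %>%
    select(-stocks) %>%
    ungroup()
}
overlap_past(df, 1)  # past_1Q; overlap_past(df, 2) gives past_1_2Q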

Remove all records that have duplicates based on more than one variable

I have data like this
df <- data.frame(var1 = c("A", "A", "B", "B", "C", "D", "E"), var2 = c(1, 2, 3, 4, 5, 5, 6))
# var1 var2
# 1 A 1
# 2 A 2
# 3 B 3
# 4 B 4
# 5 C 5
# 6 D 5
# 7 E 6
A is mapped to 1, 2
B is mapped to 3, 4
C and D are both mapped to 5 (and vice versa: 5 is mapped to C and D)
E is uniquely mapped to 6 and 6 is uniquely mapped to E
I would like to filter the dataset so that only
var1 var2
7 E 6
is returned. Base R or tidyverse solutions are welcome.
I have tried
unique(df$var1, df$var2)
df[!duplicated(df),]
df %>% distinct(var1, var2)
but none of these gives the wanted result.
Using igraph::components.
Represent the data as a graph and get its connected components:
library(igraph)
g = graph_from_data_frame(df)
cmp = components(g)
Grab components where cluster size (csize) is 2. Output vertices as a two-column character matrix:
matrix(names(cmp$membership[cmp$membership %in% which(cmp$csize == 2)]),
ncol = 2, dimnames = list(NULL, names(df))) # wrap in as.data.frame if desired
# var1 var2
# [1,] "E" "6"
Alternatively, use names of relevant vertices to index original data frame:
v = names(cmp$membership[cmp$membership %in% which(cmp$csize == 2)])
df[df$var1 %in% v[1:(length(v)/2)], ]
# var1 var2
# 7 E 6
Visualize the connections:
plot(g)
Using a custom function to determine whether the mapping is unique, you could achieve your desired result like so:
df <- data.frame(
var1 = c("A", "A", "B", "B", "C", "D", "E"),
var2 = c(1, 2, 3, 4, 5, 5, 6)
)
# For each row: does x take a single value within the row's y group?
is_unique <- function(x, y) ave(as.numeric(factor(x)), y, FUN = function(x) length(unique(x)) == 1)
df[is_unique(df$var2, df$var1) & is_unique(df$var1, df$var2), ]
#> var1 var2
#> 7 E 6
Another igraph option
decompose(graph_from_data_frame(df)) %>%
subset(sapply(., vcount) == 2) %>%
sapply(function(g) names(V(g)))
which gives
[,1]
[1,] "E"
[2,] "6"
A base R solution:
df[!(duplicated(df$var1) | duplicated(df$var1, fromLast = TRUE) |
duplicated(df$var2) | duplicated(df$var2, fromLast = TRUE)), ]
var1 var2
7 E 6
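The same idea reads naturally in dplyr (a sketch): keep only rows whose var1 value and var2 value each occur exactly once in the data:
library(dplyr)
df %>%
  group_by(var1) %>% filter(n() == 1) %>%
  group_by(var2) %>% filter(n() == 1) %>%
  ungroup()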

Find points in a data frame where (col_1[i], col_2[i]) = (col_1[j], -col_2[j])

There might be an obvious solution to this that I have missed but here goes:
Consider the data frame below. I wish to create a column of TRUE/FALSE values, where the value is TRUE whenever the condition (col_1[i], col_2[i]) = (col_1[j], -col_2[j]) is fulfilled for some other row j. Note that summing col_2 within groups does not work here, since a group might contain a third, unpaired value.
To elaborate; what I have is:
col_1 <- c("x", "x", "y", "y", "y", "z", "z")
col_2 <- c(-1, 1, 3, -3, 4, 7, 3)
df <- data.frame(col_1, col_2)
What I want is:
col_1 col_2 check
1 x -1 TRUE
2 x 1 TRUE
3 y 3 TRUE
4 y -3 TRUE
5 y 4 FALSE
6 z 7 FALSE
7 z 3 FALSE
I think the answer must be something with df %>% group_by(col_1), but I can't think of the complete solution.
Here is my attempt. As you were saying, grouping the data is necessary. I defined groups with col_1 and foo, where foo contains the absolute values of col_2. If a group has more than one observation and col_2 takes exactly two distinct values within it, you have the pairs you are searching for.
library(dplyr)
group_by(df, col_1, foo = abs(col_2)) %>%
  mutate(check = n() > 1 & n_distinct(col_2) == 2) %>%
  ungroup() %>%
  select(-foo)
col_1 col_2 check
<fct> <dbl> <lgl>
1 x -1 TRUE
2 x 1 TRUE
3 y 3 TRUE
4 y -3 TRUE
5 y 4 FALSE
6 z 7 FALSE
7 z 3 FALSE
As Ronak previously mentioned, there may be cases like this.
col_1 <- c("x", "x", "y", "y", "y", "z", "z")
col_2 <- c(1, 1, 3, -3, 4, 7, 3)
df2 <- data.frame(col_1, col_2)
col_1 col_2
1 x 1
2 x 1
3 y 3
4 y -3
5 y 4
6 z 7
7 z 3
group_by(df2, col_1, foo = abs(col_2)) %>%
  mutate(check = n() > 1 & n_distinct(col_2) == 2) %>%
  ungroup() %>%
  select(-foo)
col_1 col_2 check
<fct> <dbl> <lgl>
1 x 1 FALSE
2 x 1 FALSE
3 y 3 TRUE
4 y -3 TRUE
5 y 4 FALSE
6 z 7 FALSE
7 z 3 FALSE
You can try the following base R code, where a custom function f finds the positions of all value pairs in a vector that sum to zero:
f <- function(v) {
  # positions of all value pairs in v that sum to zero
  unique(c(combn(seq_along(v), 2)[, combn(v, 2, sum) == 0]))
}
dfout <- Reduce(rbind,
                lapply(split(df, df$col_1),
                       function(v) {
                         v$col_3 <- FALSE
                         v$col_3[f(v$col_2)] <- TRUE
                         v
                       }))
dfout <- dfout[order(as.numeric(rownames(dfout))), ]
such that
> dfout
col_1 col_2 col_3
1 x -1 TRUE
2 x 1 TRUE
3 y 3 TRUE
4 y -3 TRUE
5 y 4 FALSE
6 z 7 FALSE
7 z 3 FALSE
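A vectorized base R variant of the same idea (a sketch): a row is TRUE exactly when its negation occurs in the same col_1 group. The zero guard stops a lone 0 from matching itself (adjust it if paired zeros should count):
key  <- paste(df$col_1, df$col_2)
nkey <- paste(df$col_1, -df$col_2)
df$check <- nkey %in% key & df$col_2 != 0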

Sum values of combinations within a group

For an analysis, I would like to transform data from:
data <- data.frame(
  Customer = c("A", "A", "B", "B", "C", "C", "C"),
  Product  = c("X", "Y", "X", "Z", "X", "Y", "Z"),
  Value    = c(10, 15, 5, 10, 20, 5, 10)
)
data
# Customer Product Value
# 1 A X 10
# 2 A Y 15
# 3 B X 5
# 4 B Z 10
# 5 C X 20
# 6 C Y 5
# 7 C Z 10
To:
Product1 | Product2 | Sum Value
---------|----------|----------
X        | Y        |       50
X        | Z        |       45
Y        | Z        |       15
Basically, I want the sum of Value for every product combination within a customer. I guess it could work with the help of the reshape package, but I cannot get it to work.
Thanks for your time.
Here is one way, in two steps:
1) transform your data into a long data.frame of all pairs within customers. For that, I rely on combn to provide the indices of all possible pairs:
process.one <- function(x) {
n <- nrow(x)
i <- combn(n, 2)
data.frame(Product1 = x$Product[i[1, ]],
Product2 = x$Product[i[2, ]],
Value = x$Value[i[1, ]] +
x$Value[i[2, ]])
}
library(plyr)
long <- ddply(data, "Customer", process.one)
long
# Customer Product1 Product2 Value
# 1 A X Y 25
# 2 B X Z 15
# 3 C X Y 25
# 4 C X Z 30
# 5 C Y Z 15
2) drop the Customer dimension and aggregate your values:
aggregate(Value ~ ., long[c("Product1", "Product2", "Value")], sum)
# Product1 Product2 Value
# 1 X Y 50
# 2 X Z 45
# 3 Y Z 15
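Since plyr is superseded, the same two steps can be written in dplyr (a sketch; it needs dplyr >= 1.1 for reframe and assumes every customer has at least two products, because combn(n, 2) requires n >= 2):
library(dplyr)
data %>%
  group_by(Customer) %>%
  reframe({
    i <- combn(length(Product), 2)       # all index pairs within this customer
    tibble(Product1 = Product[i[1, ]],
           Product2 = Product[i[2, ]],
           Value    = Value[i[1, ]] + Value[i[2, ]])
  }) %>%
  count(Product1, Product2, wt = Value, name = "Value")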
