Related
I have a dataset with several binary variables (x1-x5, values: 1, 2, NA). My goal is to identify whether pairs of binary variables have zero or very low cell counts in the cross-tab table (after ignoring the missing values). So, I would like to calculate the cross-tab table for each pair of binary variables in my data set, extract the lowest value from each table, and report the lowest value from each cross-table into a matrix. By doing so, I would have something similar to a correlation matrix where, instead of correlation coefficients, I would be able to look at the lowest cell count for each pair of variables. Below I created a toy dataset for anyone who will decide to help.
library(tidyverse)
df <- data.frame(x1 = rbinom(n = 1000, size = 1, prob = 0.5),
x2 = rbinom(n = 1000, size = 1, prob = 0.3),
x3 = rbinom(n = 1000, size = 1, prob = 0.4),
x4 = rbinom(n = 1000, size = 1, prob = 0.2),
x5 = rbinom(n = 1000, size = 1, prob = 0.05)) |>
mutate(across(everything(), ~as.factor(.))) |>
mutate(across(everything(), ~recode(., "1" = "2", "0" = "1")))
df1 <- as.data.frame(lapply(df, function(cc) cc[ sample(c(TRUE, NA), prob = c(0.85, 0.15), size = length(cc), replace = TRUE) ]))
I think this is what you mean. It's inefficient (we should only compute one triangle) but short.
cfun <- function(i, j) {
min(table(df[[i]], df[[j]]))
}
outer(1:ncol(df), 1:ncol(df), Vectorize(cfun))
If you want to be more efficient:
n <- ncol(df)
m <- matrix(NA_integer_, n, n, dimnames = list(names(df), names(df)))
for (i in 1:(n-1)) {
for (j in (i+1):n) {
m[j,i] <- cfun(i,j)
}
}
Someone (probably #dcsuka) suggested another solution but then deleted it from the answer section. Thankfully, I had already saved it in my script. After tweaking the code a tiny bit, it returned the correct results. So I am copying it here because, as Ben said, diversity is good.
df2 <- df1 %>%
colnames() %>%
combn(2) %>%
t() %>%
as_tibble(.name_repair = ~c("var1", "var2"))
df3 <- df2 %>%
rowwise() %>%
mutate(crosstab = list(as_tibble(table(select(df1, var1, var2)))),
value = min(list(select(crosstab, n))[[1]])) %>%
select(-crosstab) %>%
pivot_wider(names_from = var1, values_from = value)
I need to write a function involving subsetting a df by a variable n bins. Like, if n is 2, then subsample the df some number of times in two bins (from the first half, then from the second half). If n is 3, subsample in 3 bins (first 1/3, second 1/3, third 1/3). I've been doing this for different lengths of n manually so far, and I know there must be a better way to do it. I want to write it into a function with n as an input, but I can't make it work so far. Code below.
# create df
df <- data.frame(year = c(1:46),
sample = seq(from=10,to=30,length.out = 46) + rnorm(46,mean=0,sd=2) )
# real df has some NAs, so we'll add some here
df[c(20,32),2] <- NA
this df is 46 years of sampling. I want to pretend instead of 46 samples, I only took 2, but at one random year in the first half (1:23), and one random year in the second half (24:46).
# to subset in 2 groups, say, 200 times
# I'll make a df of elements to sample
samplelist <- data.frame(firstsample = sample(1:(nrow(df)/2),200,replace = T), # first sample in first half of vector
secondsample = sample((nrow(df)/2):nrow(df),200, replace = T) )# second sample in second half of vector
samplelist <- as.matrix(samplelist)
# start a df to add to
plot_df <- df %>% mutate(first='all',
second = 'all',
group='full')
# fill the df using coords from expand.grid
for(i in 1:nrow(samplelist)){
plot_df <<- rbind(plot_df,
df[samplelist[i,] , ] %>%
mutate(
first = samplelist[i,1],
second = samplelist[i,2],
group = i
))
print(i)
}
(If we can make it skip samples on "NA" sample years, that would be extra good).
So, if I wanted to do this for three points instead of two, I'd repeat the process like this:
# to subset in 3 groups 200 times
# I'll make a df of elements to sample
samplelist <- data.frame(firstsample = sample(1:(nrow(df)/3),200,replace = T), # first sample in first 1/3
secondsample = sample(round(nrow(df)/3):round(nrow(df)*(2/3)),200, replace = T), # second sample in second 1/3
thirdsample = sample(round(nrow(df)*(2/3)):nrow(df), 200, replace=T) # third sample in last 1/3
)
samplelist <- as.matrix(samplelist)
# start a df to add to
plot_df <- df %>% mutate(first='all',
second = 'all',
third = 'all',
group='full')
# fill the df using coords from expand.grid
for(i in 1:nrow(samplelist)){
plot_df <<- rbind(plot_df,
df[samplelist[i,] , ] %>%
mutate(
first = samplelist[i,1],
second = samplelist[i,2],
third = samplelist[i,3],
group = i
))
print(i)
}
but, I want to do this many times, sampling up to ~20 times (so in 20 bins), so this manual method is not sustainable. Can you help me write a function to say "pick one sample from n bins x times"?
btw, this is the plot I am making with the complete df:
plot_df %>%
ggplot(aes(x=year,y=sample)) +
geom_point(color="grey40") +
stat_smooth(geom="line",
method = "lm",
alpha=.3,
aes(color=group,
group=group),
se=F,
show.legend = F) +
geom_line(color="grey40") +
geom_smooth(data = plot_df %>% filter(group %in% c("full")),
method = "lm",
alpha=.7,
color="black",
size=2,
#se=F,
# fill="grey40
show.legend = F
) +
theme_classic()
If I got you right, the following function splits your df in n bins, draws x samples from each and puts the results back into cols of a df:
library(tidyverse)
set.seed(42)
df <- data.frame(year = c(1:46),
sample = seq(from=10,to=30,length.out = 46) + rnorm(46,mean=0,sd=2) )
get_df_sample <- function(df, n, x) {
df %>%
# bin df in n bins of (approx.) equal length
mutate(bin = ggplot2::cut_number(seq_len(nrow(.)), n, labels = seq_len(n))) %>%
# split by bin
split(.$bin) %>%
# sample x times from each bin
map(~ .x[sample(seq_len(nrow(.x)), x, replace = TRUE),]) %>%
# keep only column "sample"
map(~ select(.x, sample)) %>%
# Rename: Add number of df-bin from which sample is drawn
imap(~ rename(.x, !!sym(paste0("sample_", .y)) := sample)) %>%
# bind
bind_cols() %>%
# Add group = rownames
rownames_to_column(var = "group")
}
get_df_sample(df, 3, 200) %>%
head()
#> sample_1 sample_2 sample_3 group
#> 1 12.58631 18.27561 24.74263 1
#> 2 19.46218 24.24423 23.44881 2
#> 3 12.92179 18.47367 27.40558 3
#> 4 15.22020 18.47367 26.29243 4
#> 5 12.58631 24.24423 24.43108 5
#> 6 19.46218 23.36464 27.40558 6
Created on 2020-03-24 by the reprex package (v0.3.0)
Here's a function using loops, closer to what you started doing:
df <- data.frame(year = c(1:46),
sample = seq(from=10, to=30, length.out = 46) +
rnorm(46,mean=0,sd=2))
df[c(20,32), 2] <- NA
my_function <- function(n, sample_size, data = df) {
plot_df <- data %>% mutate(group = 'full')
sample_matrix <- matrix(data = NA, nrow = sample_size, ncol = n)
first_row <- 1 # First subset has 1 as first row, no matter how many subsets
for (i in 1:n) {
last_row <- round(first_row + nrow(df)/n - 1) # Determine last row of i-th subset
sample_matrix[, i] <- sample(first_row:last_row, sample_size, replace = T) # Store sample directly in matrix
first_row <- i + last_row # Determine first row for next i
group_name <- paste("group", i, sep = "_") # Column name for i-th group
plot_df[[group_name]] <- "all" # Column for i-th group
}
for (j in 1:sample_size) {
# Creating a new data frame for new observations
new_obs <- df[sample_matrix[j,], ]
new_obs[["group"]] <- j
for (group_n in 1:n) {
new_obs[[paste0("group_", group_n)]] <- sample_matrix[j, group_n]
}
plot_df <- rbind(plot_df, new_obs)
plot_df <<- plot_df
}
}
my_function(2, 200, data = df)
I want to plot fee as a percentage of income fee_per_inc for each income year_hh_inc quintile.
this is what I have so far:
pacman::p_load(RCurl, plm, tibble, ggplot2, AER, dplyr, car, arm, broom, tidyr, fastDummies, dummies)
x <- getURL("https://raw.githubusercontent.com/dothemathonthatone/maps/master/main_test.csv")
maindf <- read.csv(text = x, row.names=NULL)
maindf <- maindf %>%
mutate(category = cut(year_hh_inc, breaks = (quantile(year_hh_inc, c(0, 1 / 5, 2 / 5, 3 / 5, 4 / 5, 1), na.rm = TRUE)), labels = c("first_quint", "second_quint", "third_quint", 'fourth_quint', 'fifth_quint'), include.lowest = TRUE), vals = 1) %>%
pivot_wider(names_from = category, values_from = vals, values_fill = list(vals = 0))
box <- boxplot(maindf$year_hh_inc ~ maindf$fee_per_inc, col = 3:5)
This is what I would like as an end result:
I think I have a bit more work to do; any help from this point is appreciated.
I think there were a couple of problems here. You need the boxplot to have the variables the other way round. Also, you need to use the category variable that you created in mutate instead of the original variable. Lastly, you don't need the pivot_wider.
Some of the values were also way outside the useful range and may have been wrong (some numbers were -8), so I have trimmed the outliers to make the graph prettier. You'll want to check the original data to see whether this makes sense.
pacman::p_load(RCurl, plm, tibble, ggplot2, AER, dplyr, car, arm, broom, tidyr, fastDummies, dummies)
x <- getURL("https://raw.githubusercontent.com/dothemathonthatone/maps/master/main_test.csv")
maindf <- read.csv(text = x, row.names=NULL)
maindf <- maindf %>%
mutate(category = cut(year_hh_inc,
breaks = (quantile(year_hh_inc, c(0, 1/5, 2/5, 3/5, 4/5, 1), na.rm = TRUE)),
labels = c("first_quint", "second_quint", "third_quint",
'fourth_quint', 'fifth_quint'),
include.lowest = TRUE),
vals = 1)
maindf <- maindf[maindf$fee_per_inc > 0 & maindf$fee_per_inc < 0.01, ]
box <- boxplot(maindf$fee_per_inc ~ maindf$category, col = 3:5)
Created on 2020-03-03 by the reprex package (v0.3.0)
Basically I have a vector names of all names, and a dataframe df with a BIN (0/1) field and a NAME field. For every row with BIN==0, I want to create a duplicate row but with 1 instead and add it to the bottom of df with a different name. Here's what I have to select a new name, given the current name:
sample(names[names!=name], 1)
But I'm not sure how to vectorize this and furthermore add it to df with the same data from BIN.
EDIT:
Sample data:
df = data.frame(BIN=c(1,0,1), NAME=c("alice","bob","cate"))
names = c("alice","bob","cate","dan")
I got closer with something like this:
rbind(df, df %>% filter(BIN == 0) %>%
mutate(NAME = sample(names[names!=NAME],1)))
But I get an error: In binattr(e1, e2): length(e1) not a multiple of length(e2).
Here's a simple approach. I think it's pretty straightforward, let me know if you have questions:
rename = subset(df, BIN == 0)
rename$NEW_NAME = sample(names, size = nrow(rename), replace = TRUE)
while(any(rename$NAME == rename$NEW_NAME)) {
matches = rename$NAME == rename$NEW_NAME
rename$NEW_NAME[matches] = sample(names, size = sum(matches), replace = TRUE)
}
rename$BIN = 1
rename$NAME = rename$NEW_NAME
rename$NEW_NAME = NULL
result = rbind(df, rename)
result
# BIN NAME
# 1 1 alice
# 2 0 bob
# 3 1 cate
# 21 1 alice
Here's another approach, less clear but more efficient. This is the "right" way to do it, but it requires a little bit more thought and explanation.
df$NAME = factor(df$NAME, levels = names)
rename = subset(df, BIN == 0)
n = length(names)
# we will increment each level number with a random integer from
# 1 to n - 1 (with a mod to make it cyclical)
offset = sample(1:(n - 1), size = nrow(rename), replace = TRUE)
adjusted = (as.integer(rename$NAME) + offset) %% n
# reconcile 1-indexed factor levels with 0-indexed mod operator
adjusted[adjusted == 0] = n
rename$NAME = names[adjusted]
rename$BIN = 1
result = rbind(df, rename)
(or, rewritten for dplyr)
df = mutate(df, NAME = factor(NAME, levels = names))
n = length(names)
df %>% filter(BIN == 0) %>%
mutate(
offset = sample(1:(n - 1), size = n(), replace = TRUE),
adjusted = (as.integer(NAME) + offset) %% n,
adjusted = if_else(adjusted == 0, n, adjusted),
NAME = names[adjusted],
BIN = 1
) %>%
select(-offset, -adjusted) %>%
rbind(df, .)
Since your issue is the vectorization part, I'd recommend testing answer on a sample case with more than one BIN 0 row, I used this:
df = data.frame(BIN=c(1,0,1,0,0,0,0,0,0), NAME=rep(c("alice","bob","cate"), 3))
And, because I was curious, here's a benchmark for 10k rows with 26 names. Results first, code below:
# Unit: milliseconds
# expr min lq mean median uq max neval
# while_loop 34.070438 34.327020 37.53357 35.548047 39.922918 46.206454 10
# increment 1.397617 1.458592 1.88796 1.526512 2.123894 3.196104 10
# increment_dplyr 24.002169 24.681960 25.50568 25.374429 25.750548 28.054954 10
# map_char 346.531498 347.732905 361.82468 359.736403 374.648635 383.575265 10
The "clever" way is by far the fastest. My guess is the dplyr slowdown is because we can't do the direct replacement of only the relevant bits of adjusted, and instead have to add the overhead of if_else. That and we are actually adding columns to the data frame for adjusted and offset rather than dealing with vectors. This is enough to make it almost as slow as the while loop approach, which is still 10x faster than the map_chr which has to go one row at a time.
nn = 10000
df = data.frame(
BIN = sample(0:1, size = nn, replace = TRUE, prob = c(0.7, 0.3)),
NAME = factor(sample(letters, size = nn, replace = TRUE), levels = letters)
)
get.new.name <- function(c){
return(sample(names[names!=c],1))
}
microbenchmark::microbenchmark(
while_loop = {
rename = subset(df, BIN == 0)
rename$NEW_NAME = sample(names, size = nrow(rename), replace = TRUE)
while (any(rename$NAME == rename$NEW_NAME)) {
matches = rename$NAME == rename$NEW_NAME
rename$NEW_NAME[matches] = sample(names, size = sum(matches), replace = TRUE)
}
rename$BIN = 1
rename$NAME = rename$NEW_NAME
rename$NEW_NAME = NULL
result = rbind(df, rename)
},
increment = {
rename = subset(df, BIN == 0)
n = length(names)
# we will increment each level number with a random integer from
# 1 to n - 1 (with a mod to make it cyclical)
offset = sample(1:(n - 1), size = nrow(rename), replace = TRUE)
adjusted = (as.integer(rename$NAME) + offset) %% n
# reconcile 1-indexed factor levels with 0-indexed mod operator
adjusted[adjusted == 0] = n
rename$NAME = names[adjusted]
rename$BIN = 1
},
increment_dplyr = {
n = length(names)
df %>% filter(BIN == 0) %>%
mutate(
offset = sample(1:(n - 1), size = n(), replace = TRUE),
adjusted = (as.integer(NAME) + offset) %% n,
adjusted = if_else(adjusted == 0, n, adjusted),
NAME = names[adjusted],
BIN = 1
) %>%
select(-offset,-adjusted)
},
map_char = {
new.df <- df %>% filter(BIN == 0) %>%
mutate(NAME = map_chr(NAME, get.new.name)) %>%
mutate(BIN = 1)
},
times = 10
)
Well I didn't intend to answer my own question but I did find a simpler solution. I think it's better than using rowwise() but I don't know if it's necessarily the most efficient way.
library(tidyverse)
get.new.name <- function(c){
return(sample(names[names!=c],1))
}
new.df <- rbind(df, df %>% filter(BIN == 0) %>%
mutate(NAME = map_chr(NAME, get.new.name)) %>%
mutate(BIN = 1)
map_char ended up being pretty important instead of just map since the latter would return a weird list of lists.
A little weird but I think this should be what you want:
library(tidyverse)
df <- data.frame(BIN=c(1,0,1), NAME=c("alice","bob","cate"), stringsAsFactors = FALSE)
names <- c("alice","bob","cate","dan")
df %>%
mutate(NAME_new = ifelse(BIN == 0, sample(names, n(), replace = TRUE), NA)) %>%
gather(name_type, NAME, NAME:NAME_new, na.rm = TRUE) %>%
mutate(BIN = ifelse(name_type == "NAME_new", 1, BIN)) %>%
select(-name_type)
Output:
BIN NAME
1 1 alice
2 0 bob
3 1 cate
4 1 alice
My dataset has about 20 columns and I would like to create 7 new columns with lagged data for each of the 20 current columns.
For example I have column x, y, and z. I would like to create a columns for xlag1, xlag2, xlag3, xlag4, xlag5, xlag6, xlag7, ylag1, ylag2, etc..
My current attempt is with dplyr in R -
aq %>% mutate(.,
xlag1 = lag(x, 1),
xlag2 = lag(x, 2),
xlag3 = lag(x, 3),
xlag4 = lag(x, 4),
xlag5 = lag(x, 5),
xlag6 = lag(x, 6),
xlag7 = lag(x, 7),
)
As you can see it'll take alot of lines of codes to cover all 20 columns. Is there a more efficient way of doing this ? If possible in dplyr and R as I'm most familiar with the package.
We can use data.table. The shift from data.table can take a sequence of 'n'.
library(data.table)
setDT(aq)[, paste0('xlag', 1:7) := shift(x, 1:7)]
If there are multiple columns,
setDT(aq)[, paste0(rep(c("xlag", "ylag"), each = 7), 1:7) :=
c(shift(x, 1:7), shift(y, 1:7))]
If we have many columns, then specify the columns in .SDcols and loop through the dataset, get the shift, unlist and assign to new columns
setDT(aq)[, paste0(rep(c("xlag", "ylag"), each = 7), 1:7) :=
unlist(lapply(.SD, shift, n = 1:7), recursive = FALSE) , .SDcols = x:y]
We can also use the shift in dplyr
library(dplyr)
aq %>%
do(setNames(data.frame(., shift(.$x, 1:7)), c(names(aq), paste0('xlag', 1:7))))
and for multiple columns
aq %>%
do(setNames(data.frame(., shift(.$x, 1:7), shift(.$y, 1:7)),
c(names(aq), paste0(rep(c("xlag", "ylag"), each = 7), 1:7) )))
data
aq <- data.frame(x = 1:20, y = 21:40)