Aggregate if string contains specific text in R - r

I've seen a lot of posts on this topic so apologies if this is a duplicate but I couldn't figure out my problem.
I have
df <- data.frame(name = c('bike+ride','shoe+store','ride','mountian%20bike','ride+along'),
count = c(2,5,8,7,6))
and want to sum each count if it name contains a string group
group <- data.frame(group = c('ride','bike'))
So the end result looks as follows:
Group Count
bike 9
ride 16
Can anyone help?

A base R idea,
sapply(sapply(as.character(group$group), function(i) grep(i, df$name)), function(i) sum(df$count[i]))
#or make it a function
aggr1 <- function(var1, grp, cnt){
m1 <- sapply(as.character(grp), function(i) grep(i, var1))
final_d <- sapply(m1, function(i) sum(cnt[i]))
return(data.frame(Group = names(final_d),
Count = as.integer(final_d), stringsAsFactors = FALSE)
)
}
aggr1(df$name, group$group, df$count)
# Group Count
#1 ride 16
#2 bike 9

One way is
do.call(rbind, sapply(group$group, FUN = function(x, df) {
out <- df[grepl(pattern = x, x = df$name), ]
data.frame(group = x, count = sum(out$count))
}, df = df, simplify = FALSE))
group count
1 ride 16
2 bike 9
In two steps:
# make a data.frame which locates where each group level is located
grp <- as.data.frame(sapply(group$group, FUN = function(x) grepl(pattern = x, x = df$name)))
names(grp) <- group$group
# based on above location (TRUE/FALSE), sum accordingly
data.frame(count = apply(grp, MARGIN = 2, FUN = function(x, df) {
sum(df[x, "count"])
}, df = df))
count
ride 16
bike 9

A way using tidyverse packages purrr, dplyr and tidyr:
library(tidyverse) # for dplyr, purr and tidyr
groups <- c('ride','bike')
map_df(groups, ~setNames(summarize_(df, interp(~sum(df$count[grepl(var, name)], na.rm = TRUE), var = .x)), .x)) %>%
gather(group, count, na.rm = TRUE)

Related

Extract each data frame name from a list of data frames in lapply function

I have a list of many data frames and I am trying to perform manipulations to each data frame in the list. I created this lapply function and then the list is then merged together. However when trying to rename certain columns so that they include the respective data frame name:
paste(deparse(substitute(x)),"_start"
the dataframe names are being extracted like this :
x[[i]]_start_1
Here is the full code:
df_list <-lapply(df_list, function(x){
lookup <- c(start = paste(deparse(substitute(x)),"_start"),
end = paste(deparse(substitute(x)),"_end"),
top = paste(deparse(substitute(x)),"_top"),
left = paste(deparse(substitute(x)),"_left"),
height = paste(deparse(substitute(x)),"_height"),
width = paste(deparse(substitute(x)),"_width"),
type = paste(deparse(substitute(x)),"_type"),
value = paste(deparse(substitute(x)),"_value"))
x <- x %>% rename_with(.fn = ~lookup[.x], .cols = intersect(names(.), names(lookup)))
x <- arrange(x, creativeId)
x <- x[,-1]
x <- x %>% distinct()
x$counter <- with(x, ave(creativeId, with(rle(creativeId), rep(seq_along(values), lengths)), FUN = seq_along))
x <- x %>% relocate(counter)
x <- x %>% pivot_wider(names_from =counter, values_from= -names(.)[1:2])
})
new_df <- Reduce(function(x,y) merge(x,y,all=TRUE), df_list)
Please let me know if there is a workaround so that the data frame names are printed correctly. Thank you!
We may use Map
df_list2 <- Map(function(x, nm) {
lookup <- c(start = paste0(nm,"_start"),
end = paste0(nm, "_end"),
top = paste0(nm,"_top"),
left = paste0(nm,"_left"),
height = paste0(nm,"_height"),
width = paste0(nm,"_width"),
type = paste0(nm,"_type"),
value = paste0(nm,"_value"))
x <- x %>%
rename_with(.fn = ~lookup[.x], .cols = intersect(names(.), names(lookup)))
x <- arrange(x, creativeId)
x <- x[,-1]
x <- x %>% distinct()
x$counter <- with(x, ave(creativeId,
with(rle(creativeId), rep(seq_along(values), lengths)), FUN = seq_along))
x <- x %>% relocate(counter)
x <- x %>% pivot_wider(names_from =counter, values_from= -names(.)[1:2])
}, df_list, names(df_list))

variable length df subsampling function r

I need to write a function involving subsetting a df by a variable n bins. Like, if n is 2, then subsample the df some number of times in two bins (from the first half, then from the second half). If n is 3, subsample in 3 bins (first 1/3, second 1/3, third 1/3). I've been doing this for different lengths of n manually so far, and I know there must be a better way to do it. I want to write it into a function with n as an input, but I can't make it work so far. Code below.
# create df
df <- data.frame(year = c(1:46),
sample = seq(from=10,to=30,length.out = 46) + rnorm(46,mean=0,sd=2) )
# real df has some NAs, so we'll add some here
df[c(20,32),2] <- NA
this df is 46 years of sampling. I want to pretend instead of 46 samples, I only took 2, but at one random year in the first half (1:23), and one random year in the second half (24:46).
# to subset in 2 groups, say, 200 times
# I'll make a df of elements to sample
samplelist <- data.frame(firstsample = sample(1:(nrow(df)/2),200,replace = T), # first sample in first half of vector
secondsample = sample((nrow(df)/2):nrow(df),200, replace = T) )# second sample in second half of vector
samplelist <- as.matrix(samplelist)
# start a df to add to
plot_df <- df %>% mutate(first='all',
second = 'all',
group='full')
# fill the df using coords from expand.grid
for(i in 1:nrow(samplelist)){
plot_df <<- rbind(plot_df,
df[samplelist[i,] , ] %>%
mutate(
first = samplelist[i,1],
second = samplelist[i,2],
group = i
))
print(i)
}
(If we can make it skip samples on "NA" sample years, that would be extra good).
So, if I wanted to do this for three points instead of two, I'd repeat the process like this:
# to subset in 3 groups 200 times
# I'll make a df of elements to sample
samplelist <- data.frame(firstsample = sample(1:(nrow(df)/3),200,replace = T), # first sample in first 1/3
secondsample = sample(round(nrow(df)/3):round(nrow(df)*(2/3)),200, replace = T), # second sample in second 1/3
thirdsample = sample(round(nrow(df)*(2/3)):nrow(df), 200, replace=T) # third sample in last 1/3
)
samplelist <- as.matrix(samplelist)
# start a df to add to
plot_df <- df %>% mutate(first='all',
second = 'all',
third = 'all',
group='full')
# fill the df using coords from expand.grid
for(i in 1:nrow(samplelist)){
plot_df <<- rbind(plot_df,
df[samplelist[i,] , ] %>%
mutate(
first = samplelist[i,1],
second = samplelist[i,2],
third = samplelist[i,3],
group = i
))
print(i)
}
but, I want to do this many times, sampling up to ~20 times (so in 20 bins), so this manual method is not sustainable. Can you help me write a function to say "pick one sample from n bins x times"?
btw, this is the plot I am making with the complete df:
plot_df %>%
ggplot(aes(x=year,y=sample)) +
geom_point(color="grey40") +
stat_smooth(geom="line",
method = "lm",
alpha=.3,
aes(color=group,
group=group),
se=F,
show.legend = F) +
geom_line(color="grey40") +
geom_smooth(data = plot_df %>% filter(group %in% c("full")),
method = "lm",
alpha=.7,
color="black",
size=2,
#se=F,
# fill="grey40
show.legend = F
) +
theme_classic()
If I got you right, the following function splits your df in n bins, draws x samples from each and puts the results back into cols of a df:
library(tidyverse)
set.seed(42)
df <- data.frame(year = c(1:46),
sample = seq(from=10,to=30,length.out = 46) + rnorm(46,mean=0,sd=2) )
get_df_sample <- function(df, n, x) {
df %>%
# bin df in n bins of (approx.) equal length
mutate(bin = ggplot2::cut_number(seq_len(nrow(.)), n, labels = seq_len(n))) %>%
# split by bin
split(.$bin) %>%
# sample x times from each bin
map(~ .x[sample(seq_len(nrow(.x)), x, replace = TRUE),]) %>%
# keep only column "sample"
map(~ select(.x, sample)) %>%
# Rename: Add number of df-bin from which sample is drawn
imap(~ rename(.x, !!sym(paste0("sample_", .y)) := sample)) %>%
# bind
bind_cols() %>%
# Add group = rownames
rownames_to_column(var = "group")
}
get_df_sample(df, 3, 200) %>%
head()
#> sample_1 sample_2 sample_3 group
#> 1 12.58631 18.27561 24.74263 1
#> 2 19.46218 24.24423 23.44881 2
#> 3 12.92179 18.47367 27.40558 3
#> 4 15.22020 18.47367 26.29243 4
#> 5 12.58631 24.24423 24.43108 5
#> 6 19.46218 23.36464 27.40558 6
Created on 2020-03-24 by the reprex package (v0.3.0)
Here's a function using loops, closer to what you started doing:
df <- data.frame(year = c(1:46),
sample = seq(from=10, to=30, length.out = 46) +
rnorm(46,mean=0,sd=2))
df[c(20,32), 2] <- NA
my_function <- function(n, sample_size, data = df) {
plot_df <- data %>% mutate(group = 'full')
sample_matrix <- matrix(data = NA, nrow = sample_size, ncol = n)
first_row <- 1 # First subset has 1 as first row, no matter how many subsets
for (i in 1:n) {
last_row <- round(first_row + nrow(df)/n - 1) # Determine last row of i-th subset
sample_matrix[, i] <- sample(first_row:last_row, sample_size, replace = T) # Store sample directly in matrix
first_row <- i + last_row # Determine first row for next i
group_name <- paste("group", i, sep = "_") # Column name for i-th group
plot_df[[group_name]] <- "all" # Column for i-th group
}
for (j in 1:sample_size) {
# Creating a new data frame for new observations
new_obs <- df[sample_matrix[j,], ]
new_obs[["group"]] <- j
for (group_n in 1:n) {
new_obs[[paste0("group_", group_n)]] <- sample_matrix[j, group_n]
}
plot_df <- rbind(plot_df, new_obs)
plot_df <<- plot_df
}
}
my_function(2, 200, data = df)

Processing a data frame in r by subgroup: is it possible to get rid of the 'for' loop?

I frequently work with data frames and have to run some sophisticated data wrangling / manipulations by subgroup that is defined in one of the columns. I am aware of dplyr and group_by and know that many things could be solved using group_by. However, often I have to do some pretty intricate calculations and end up just using the 'for' loop.
I was wondering about the existence of some other general approach or paradigm that is faster/more elegant. Maybe map (that I am not very familiar with)?
Below is an example. Notice - it is fake and meaningless. So let's ignore why I need to do those things or the fact that there could be 2 consequtive NAs in a column, etc. That's not the focus of my question. The point is that often I have to operate "within the constraints of a subgroup" and then - inside that subgroup - I have to do operations columnwise, rowwise and sometimes even cellwise.
I also realize that I could probably put most of that code inside a function, split my data frame into a list based on 'group', apply this function to each element of that list and then do.call(rbind...) at the end. But is this the only way?
Thanks a lot for any hints!
library(dplyr)
library(forcats)
set.seed(123)
x <- tibble(group = c(rep('a', 10), rep('b', 10), rep('c', 10)),
attrib = c(sample(c("one", "two", "three", "four"), 10, replace = T),
sample(c("one", "two", "three"), 10, replace = T),
sample(c("one", "three", "four"), 10, replace = T)),
v1 = sample(c(1:5, NA), 30, replace = T),
v2 = sample(c(1:5, NA), 30, replace = T),
v3 = sample(c(1:5, NA), 30, replace = T),
n1 = abs(rnorm(30)), n2 = abs(rnorm(30)), n3 = abs(rnorm(30)))
v_vars = paste0("v", 1:3)
n_vars = paste0("n", 1:3)
results <- NULL # Placeholder for final results
for(i in seq(length(unique(x$group)))) { # loop through groups
mygroup <- unique(x$group)[i]
mysubtable <- x %>% filter(group == mygroup)
# IMPUTE NAs in v columns
# Replace every NA with a mean of values above and below it; and if it's the first or
# the last value, with the mean of 2 values below or above it.
for (v in v_vars){ # loop through v columns
which_nas <- which(is.na(mysubtable[[v]])) # create index of NAs for column v
if (length(which_nas) == 0) next else {
for (na in which_nas) { # loop through indexes of column values that are NAs
if (na == 1) {
mysubtable[[v]][na] <- mean(c(mysubtable[[v]][na + 1],
mysubtable[[v]][na + 2]), na.rm = TRUE)
} else if (na == nrow(mysubtable)) {
mysubtable[[v]][na] <- mean(c(mysubtable[[v]][na - 2],
mysubtable[[v]][na - 1]), na.rm = TRUE)
} else {
mysubtable[[v]][na] <- mean(c(mysubtable[[v]][na - 1],
mysubtable[[v]][na + 1]), na.rm = TRUE)
}
} # end of loop through NA indexes
} # end of else
} # end of loop through v vars
# Aggregate v columns (mean) for each value of column 'attrib'
result1 <- mysubtable %>% group_by(attrib) %>%
summarize_at(v_vars, mean)
# Aggregate n columns (sum) for each value of column 'attrib'
result2 <- mysubtable %>% group_by(attrib) %>%
summarize_at(n_vars, sum)
# final result should contain the name of the group
results[[i]] <- cbind(mygroup, result1, result2[-1])
}
results <- do.call(rbind, results)
Maybe this example is too simple, but in this case, the only thing you need to pull out is the imputation.
my_impute <- function(x) {
which_nas <- which(is.na(x))
for (na in which_nas) {
if (na == 1) {
x[na] <- mean(c(x[na + 1], x[na + 2]), na.rm = TRUE)
} else if (na == length(x)) {
x[na] <- mean(c(x[na - 2], x[na - 1]), na.rm = TRUE)
} else {
x[na] <- mean(c(x[na - 1], x[na + 1]), na.rm = TRUE)
}
}
x
}
Then you just need to group appropriately and impute and summarize.
x2 <- x %>% group_by(group) %>% mutate_at(v_vars, my_impute) %>%
group_by(group, attrib)
full_join(x2 %>% summarize_at(v_vars, mean),
x2 %>% summarize_at(n_vars, sum))
My usual method for things like this, where similar calculations need to be on a bunch of columns, is to put it in long format. Here it feels a little like the long way round, but perhaps this would be useful to see.
x %>% mutate(row=1:n()) %>% gather("variable", "value", c(v_vars, n_vars)) %>%
separate(variable, c("var", "x"), sep=1) %>% spread(var, value) %>%
arrange(group, x, row) %>% group_by(group, x) %>%
mutate(v=my_impute(v)) %>% group_by(group, attrib, x) %>%
summarize(v=mean(v), n=sum(n)) %>%
gather("var", "value", v, n) %>% mutate(X=paste0(var, x)) %>%
select(-x, -var) %>% spread(X, value)
More generally, split-apply-combine is probably the way to go, as you suggest in your question; here's a way using the tidyverse.
doX <- function(x) {
x2 <- x %>% mutate_at(v_vars, my_impute) %>% group_by(attrib)
full_join(x2 %>% summarize_at(v_vars, mean),
x2 %>% summarize_at(n_vars, sum))
}
x %>% group_by(group) %>% nest() %>%
mutate(result=map(data, doX)) %>% select(-data) %>% unnest()
The more traditional method is with do.call, split, and rbind; here I don't make the effort to keep the group information.
do.call(rbind, lapply(split(x, x$group), doX))
The first thing to do is to change your data imputing into a function. I made some simple modifications to have it accept a vector and simplified the call to mean.
fx_na_rm <- function(z) {
which_nas <- which(is.na(z))
if (length(which_nas) > 0) {
for (na in which_nas) { # loop through indexes of column values that are NAs
if (na == 1) {
z[na] <- mean(z[na + (1:2)], na.rm = TRUE)
} else if (na == nrow(mysubtable)) {
z[na] <- mean(z[na - (1:2)], na.rm = TRUE)
} else {
z[na] <- mean(z[c(na - 1, na + 1)], na.rm = TRUE)
}
} # end of loop through NA indexes
}
return(z)
}
I like data.table so here's a solution that uses it. Now since you use different functions for the n and v variable groups, most purrr or any other solutions will also be a little funny.
library(data.table)
dt <- copy(as.data.table(x))
v_vars = paste0("v", 1:3)
n_vars = paste0("n", 1:3)
dt[, (v_vars) := lapply(.SD, as.numeric), .SDcols = v_vars]
dt[, (v_vars) := lapply(.SD, fx_na_rm), by = group, .SDcols = v_vars]
# see https://stackoverflow.com/questions/50626316/r-data-table-apply-function-a-to-some-columns-and-function-b-to-some-others
scols <- list(v_vars, n_vars)
funs <- rep(c(mean, sum), lengths(scols))
dt[, setNames(Map(function(f, x) f(x), funs, .SD), unlist(scols))
, by = .(group,attrib)
, .SDcols = unlist(scols)]
The for loop itself is difficult to vectorize because the results can depend on itself. Here is my attempt which is not an identical output to yours:
# not identical
fx_na_rm2 <- function(z) {
which_nas <- which(is.na(z))
if (length(which_nas) > 0) {
ind <- c(rbind(which_nas - 1 + 2 * (which_nas == 1) + -1 * (which_nas == length(z)),
which_nas + 1 + 1 * (which_nas == 1) + -2 * (which_nas == length(z))))
z[which_nas] <- colMeans(matrix(z[ind], nrow = 2), na.rm = T)
}
return(z)
}

Find max of rows from specific columns and extract column name and corresponding row value from another column

Here is a data structure that I have:
structure(list(UDD_beta = c(1.17136554204268, 0.939587997289016
), UDD_pval = c(0, 0), UDD_R.sq = c(0.749044972637797, 0.516943886705951
), SSX_beta = c(1.05356804780772, 0.927948300464624), SSX_pval = c(0,
0), SSX_R.sq = c(0.60226298037862, 0.629111666509209), SPP_beta = c(0.675765151939885,
0.516425218613404), SPP_pval = c(0, 0), SPP_R.sq = c(0.479849538274406,
0.378266618442121), EEE_beta = c(0.690521022226874, 0.639380962824289
), EEE_pval = c(0, 0), EEE_R.sq = c(0.585610742768951, 0.676073352909597
)), .Names = c("UDD_beta", "UDD_pval", "UDD_R.sq", "SSX_beta",
"SSX_pval", "SSX_R.sq", "SPP_beta", "SPP_pval", "SPP_R.sq",
"EEE_beta", "EEE_pval", "EEE_R.sq"), row.names = c("DDK", "DDL"
), class = "data.frame")
I want to take R.sq columns and for each row find the max and the column name of the max value. Then take corresponding beta. Expected output:
Name Value
DDK UDD 1.17136554204268
DDL EEE 0.690521022226874
Sorry, the second expected value should be 0.639380962824289.
We could use max.col. Subset the columns of interest i.e. columns that have 'R.sq' using the grep, then get the column index of max value with max.col. Use that to get the column names and also the values that correspond to a particular row (row/column indexing)
i1 <- grep("R.sq", names(df1))
i2 <- max.col(df1[i1], "first")
i3 <- grep("beta", names(df1))
res <- data.frame(Names = sub("_.*", "", names(df1)[i1][i2]),
Value = df1[i3][cbind(1:nrow(df1), i2)])
row.names(res) <- row.names(df1)
sub_data <- data[grep("R.sq", colnames(data))]
colnames(sub_data) <- gsub("_R.sq", "", colnames(sub_data))
sub_data$Name <- NA
sub_data$Value <- NA
for (i in 1:nrow(sub_data)){
sub_data$Name[i] <- names(sub_data[i,])[which.max(apply(sub_data[i,], 2, max))]
sub_data$Value[i] <- max(data[grep(paste0(sub_data$Name[i], "_beta"), colnames(data))], na.rm=T)
}
sub_data[c("Name", "Value")]
# Name Value
#DDK UDD 1.171366
#DDL EEE 0.690521
You can use a tidyverse approach via gathering your df to long and filtering both R.sq vars and max value, i.e.
library(tidyverse)
df %>%
rownames_to_column('ID') %>%
gather(var, val, -ID) %>%
filter(grepl('R.sq|beta', var)) %>%
group_by(ID) %>%
mutate(max1=as.integer(val == max(val[grepl('R.sq', var)]))) %>%
group_by(ID, grp = sub('_.*', '', var)) %>%
filter(!all(max1 == 0) & grepl('beta', var)) %>%
ungroup() %>% select(-c(max1, grp))
which gives,
# A tibble: 2 x 3
ID var val
<chr> <chr> <dbl>
1 DDK UDD_beta 1.171366
2 DDL EEE_beta 0.639381
# Need ID for all possible betas and Rsq
ID <- gsub("_R.sq", "", grep("_R.sq$", names(INPUT), value = TRUE))
dummy <- function(x) {
# Find out which Rsq is largest
i <- ID[which.max(x[paste0(ID, "_R.sq")])]
# Extract beta for largest Rsq
data.frame(Name = i, Value = x[paste0(i, "_beta")])
}
do.call("rbind", apply(INPUT, 1, dummy))

randomize observations by groups (blocks)

I have a data frame with I obsevations, and each observation belongs to one of g categories.
set.seed(9782)
I <- 500
g <- 10
library(dplyr)
anon_id <- function(n = 1, length = 12) {
randomString <- c(1:n)
for (i in 1:n)
{
randomString[i] <- paste(sample(c(0:9, letters, LETTERS),
length, replace = TRUE),
collapse = "")
}
return(randomString)
}
df <- data.frame(id = anon_id(n = I, length = 16),
group = sample(1:g, I, T))
I want to randomly assign each observation to one of J "urns", given some vector of probabilities p. That is the probability of being assign to urn J=1 is p[1]. The added complexity is that I want to do this block by block.
If I ignore the blocks, I can do this easily:
J <- 3
p <- c(0.25, 0.5, 0.25)
df1 <- df %>% mutate(urn = sample(x = c(1:J), size = I, replace = T, prob = p))
I thought about this method to do it by "block"
# Block randomization
randomize_block <- function(g) {
df1 <- df %>% filter(group==g)
size <- nrow(df1)
df1 <- df1 %>% mutate(urn = sample(x = c(1:J),
size = size,
replace = T,
prob = p))
return(df1)
}
df2 <- lapply(1:g, randomize_block)
df2 <- data.table::rbindlist(df2)
Is there a better way?
Not sure if this is better, but here is a base R technique with data.frame df, that has group name "group" as well as urn assignments 1:J with assignment probabilities in vector p of length J.
# get urn assignment
urnAssignment <- lapply(unique(df$group),
function(i) sample(1:J, nrow(df[group==i,]), replace =T, prob=p))
# get a list that collects position of observations
obsOrder <- lapply(unique(df$group),
function(i) which(df$group == i))
df$urnAssignment <- unlist(urnAssignment)[unlist(obsOrder)]
randomizr::block_ra does exactly what you want.
library(randomizr)
library(janitor) #just for the tabyl function
block_rand <- as.tibble(randomizr::block_ra(blocks = df$group, conditions = c("urn_1","urn_2","urn_3")))
df2 <- as.tibble(bind_cols(df, block_rand))
df2 %>% janitor::tabyl(group, value)
This does the trick using dplyr:
randomize <- function(data, groups=2, block_id = NULL, p=NULL, seed=9782) {
if(is.null(p)) p <- rep(1/groups, groups)
if(is.null(block_id)){
df1 <- data %>%
mutate(Treatment = sample(x = c(1:groups),
size = n(),
replace = T,
prob = p))
return(df1)
}else{
df1 <- data %>% group_by_(block_id) %>%
mutate(Treatment = sample(x = c(1:groups),
size = n(),
replace = T,
prob = p))
}
}
df1 <- randomize(data = df, groups = J, block_id = "group", p = p, seed = 9782)

Resources