I have a (new) question related to expss tables. I wrote a very simple UDF (that relies on few expss functions), as follows:
library(expss)
z_indices <- function(x, m_global, std_global, weight=NULL){
if(is.null(weight)) weight = rep(1, length(x))
z <- (w_mean(x, weight)-m_global)/std_global
indices <- 100+(z*100)
return(indices)
}
Reproducible example, based on infert dataset (plus a vector of arbitrary weights):
data(infert)
infert$w <- as.vector(x=rep(2, times=nrow(infert)), mode='numeric')
infert %>%
tab_cells(age, parity) %>%
tab_cols(total(), education, case %nest% list(total(), education)) %>%
tab_weight(w) %>%
tab_stat_valid_n(label="N") %>%
tab_stat_mean(label="Mean") %>%
tab_stat_fun(label="Z", function(x, m_global, std_global, weight=NULL){
z_indices(x, m_global=w_mean(infert$age, infert$w),std_global=w_sd(infert$age, infert$w))
}) %>%
tab_pivot(stat_position="inside_columns")
The table is computed and the output for the first line is (almost) as expected.
Then things go messy for the second line, since both arguments of z_indices explicitely refer to infert$age, where infert$parity is expected.
My question: is there a way to dynamically pass the variables of tab_cells as function argument within tab_stat_fun to match the variable being processed? I guess this happens inside function declaration but have not clue how to proceed...
Thanks!
EDIT April 28th 2020:
Answer from #Gregory Demin works great in the scope of infert dataset, although for better scalability to larger dataframes I wrote the following loop:
var_df <- data.frame("age"=infert$age, "parity"=infert$parity)
tabZ=infert
for(each in names(var_df)){
tabZ = tabZ %>%
tab_cells(var_df[each]) %>%
tab_cols(total(), education) %>%
tab_weight(w) %>%
tab_stat_valid_n(label="N") %>%
tab_stat_mean(label="Mean") %>%
tab_stat_fun(label="Z", function(x, m_global, std_global, weight=NULL){
z_indices(x, m_global=w_mean(var_df[each], infert$w),std_global=w_sd(var_df[each], infert$w))
})
}
tabZ = tabZ %>% tab_pivot()
Hope this inspires other expss users in the future!
There is no universal solution for this case. Function in the tab_stat_fun is always calculated inside cell so you can't get global values in it.
However, in your case we can calculate z-index before summarizing. Not so flexible solution but it works:
# function for weighted z-score
w_z_index = function(x, weight = NULL){
if(is.null(weight)) weight = rep(1, length(x))
z <- (x - w_mean(x, weight))/w_sd(x, weight)
indices <- 100+(z*100)
return(indices)
}
data(infert)
infert$w <- rep(2, times=nrow(infert))
infert %>%
tab_cells(age, parity) %>%
tab_cols(total(), education, case %nest% list(total(), education)) %>%
tab_weight(w) %>%
tab_stat_valid_n(label="N") %>%
tab_stat_mean(label="Mean") %>%
# here we get z-index instead of original variables
tab_cells(age = w_z_index(age, w), parity = w_z_index(parity, w)) %>%
tab_stat_mean(label="Z") %>%
tab_pivot(stat_position="inside_columns")
UPDATE.
A little more scalable approach:
w_z_index = function(x, weight = NULL){
if(is.null(weight)) weight = rep(1, length(x))
z <- (x - w_mean(x, weight))/w_sd(x, weight)
indices <- 100+(z*100)
return(indices)
}
w_z_index_df = function(df, weight = NULL){
df[] = lapply(df, w_z_index, weight = weight)
df
}
data(infert)
infert$w <- rep(2, times=nrow(infert))
infert %>%
tab_cells(age, parity) %>%
tab_cols(total(), education, case %nest% list(total(), education)) %>%
tab_weight(w) %>%
tab_stat_valid_n(label="N") %>%
tab_stat_mean(label="Mean") %>%
# here we get z-index instead of original variables
# we process a lot of variables at once
tab_cells(w_z_index_df(data.frame(age, parity))) %>%
tab_stat_mean(label="Z") %>%
tab_pivot(stat_position="inside_columns")
Related
I am trying to create an index of a set of variables by taken the mean of the selected variables using the following code:
data <- data %>%
group_by(country) %>%
# Standardize each component/measure
mutate(
std_var1 = standardize(var1, Z),
std_var2 = standardize(var2, Z),
std_var3 = standardize(var3, Z),
std_var4 = standardize(var4, Z)
) %>%
ungroup() %>%
dplyr::select(std_var1,
std_var2,
std_var3,
std_var4) %>%
# Average all z scores for an individual
mutate(index = pmap_dbl(., ~ mean(c(...), na.rm = T))) %>%
cbind(data, .) %>% unnest() %>%
I also use the idx_mean package that takes the following syntax:
mutate(data, idx_var = idx_mean(std_var1, std_var2, std_var3, std_var4))
and obtain similar but not exactly the same index values (not just a matter of rounding).
Is there one approach that seems more accurate here?
The 4th and 5th columns display index values created by the idx function (4th column) and the other approach (5th column.)
So, I recently building a music recommender system using Collaborative Filtering in Rstudio. I have some problem with the function of cosine similarity which the system said "subscript out of bond" on the matrix that I want to calculate.
I use Cosine Similarity which I got the reference from this website: https://bgstieber.github.io/post/recommending-songs-using-cosine-similarity-in-r/
I've tried to fix the script but still apparently the output isn't working.
##cosinesim-crossprod
cosine_sim <- function(a,b) {crossprod(a,b)/sqrt(crossprod(a)*crossprod(b))}
##User data
play_data <- "https://static.turi.com/datasets/millionsong/10000.txt" %>%
read_tsv(col_names = c('user', 'song_id', 'plays'))
##Song data
song_data <- read_csv("D:/3rd Term/DataAnalysis/dataSet/song_data.csv") %>%
distinct(song_id, title, artist_name)
##Grouped
all_data <- play_data %>%
group_by(user, song_id) %>%
summarise(plays = sum(plays, na.rm = TRUE)) %>%
inner_join(song_data)
top_1k_songs <- all_data %>%
group_by(song_id, title, artist_name) %>%
summarise(sum_plays = sum(plays)) %>%
ungroup() %>%
top_n(1000, sum_plays) %>%
distinct(song_id)
all_data_top_1k <- all_data %>%
inner_join(top_1k_songs)
top_1k_wide <- all_data_top_1k %>%
ungroup() %>%
distinct(user, song_id, plays) %>%
spread(song_id, plays, fill = 0)
ratings <- as.matrix(top_1k_wide[,-1])
##Function
calc_cos_sim <- function(song_code = top_1k_songs,
rating_mat = ratings,
songs = song_data,
return_n = 5) {
song_col_index <- which(colnames(ratings)== song_code) %>%
cos_sims <- apply(rating_mat, 2,FUN = function(y)
cosine_sim(rating_mat[,song_col_index], y))
##output
data_frame(song_id = names(cos_sims), cos_sim = cos_sims) %>%
filter(song_id != song_code) %>% # remove self reference
inner_join(songs) %>%
arrange(desc(cos_sim)) %>%
top_n(return_n, cos_sim) %>%
select(song_id, title, artist_name, cos_sim)
}
I expect when I use this script:
shots <- 'SOJYBJZ12AB01801D0'
knitr::kable(calc_cos_sim(shots))
The output would be a data frame of 5 songs.
The pipe at the end of this line looks like a typo:
song_col_index <- which(colnames(ratings)== song_code) %>%
Replace it with:
song_col_index <- which(colnames(ratings)== song_code)
I have the follwing code that takes a dataframe called dft1 and then produces a resulting dataframe called dfb1. I want to repeat the same code for multiple input dataframes such as dft1, dft2 all indexed by a number towards the end and then store the results using the same pattern i.e. dfb1, dfb2, ....
I have tried many methods such as using dapply or for loops but given the nature of the code inside I wasn't able to get the intended results.
#define the function for rolling
window <- 24
rolling_lm <-
rollify(.f = function(R_excess, MKT_RF, SMB, HML) {
lm(R_excess ~ MKT_RF + SMB + HML)
}, window = window, unlist = FALSE)
#rolling over the variable
dfb1 <-
dft1 %>%
mutate(rolling_ff =
rolling_lm(R_excess,
MKT_RF,
SMB,
HML)) %>%
mutate(tidied = map(rolling_ff,
tidy,
conf.int = T)) %>%
unnest(tidied) %>%
slice(-1:-23) %>%
select(date, term, estimate, conf.low, conf.high) %>%
filter(term != "(Intercept)") %>%
rename(beta = estimate, factor = term) %>%
group_by(factor)
Add the command you want to apply to each dataframe in a function
apply_fun <- function(df) {
df %>%
mutate(rolling_ff =
rolling_lm(R_excess,
MKT_RF,
SMB,
HML)) %>%
mutate(tidied = map(rolling_ff,
tidy,
conf.int = T)) %>%
unnest(tidied) %>%
slice(-1:-23) %>%
select(date, term, estimate, conf.low, conf.high) %>%
filter(term != "(Intercept)") %>%
rename(beta = estimate, factor = term) %>%
group_by(factor)
}
Now apply the function to each dataframe and store the results in a list
n <- 10
out <- setNames(lapply(mget(paste0("dft", 1:n)), apply_fun), paste0("dfb", 1:n))
Assuming you have input dataframes like dft1, dft2...this will output a list of dataframes which you can now access doing out[['dfb1']], out[['dfb2']] and so on. Change the value of n based on number of dft dataframes you have.
If the data is already present in a list we can avoid mget by doing
setNames(lapply(result, apply_fun), paste0("dfb", 1:n))
I'm trying to create a function that essentially gets me the MODE...or MODE-X (2nd-Xth most common value & and the associated counts for each column in a data frame.
I can't figure out what I may be missing and I'm looking for some assistance? I believe it has to do with the passing in of a variable into dplyr function.
library(tidyverse)
myfunct_get_mode = function(x, rank=1){
mytable = dplyr::count(rlang::sym(x), sort = TRUE)
names(mytable)= c('variable','counts')
# return just the rank specified...such as mode or mode -1, etc
result = table %>% dplyr::slice(rlang::sym(rank))
return(result)
}
mtcars %>% lapply(. %>% (function(x) myfunct_get_mode(x, rank=2)))
There are some problems with your function:
You function-call is not doing what you think. Check with mtcars %>% lapply(. %>% (function(x) print(x))) that actually your x is the whole column of mtcars. To get the names of the column apply the function to names(mtcars). But then you also have to specify the dataframe you're working on.
To evaluate a symbol you get sym from you need to use !! in front of the rlang::sym(x).
rank is not a variable name, thus no need for rlang::sym here.
table should be mytable in second to last line of your function.
So how could it work (although there are probably better ways):
myfunct_get_mode = function(df, x, rank=1){
mytable = count(df, !!rlang::sym(x), sort = TRUE)
names(mytable)= c('variable','counts')
# return just the rank specified...such as mode or mode -1, etc
result = mytable %>% slice(rank)
return(result)
}
names(mtcars) %>% lapply(function(x) myfunct_get_mode(mtcars, x, rank=2))
If we need this in a list, we can use map
f1 <- function(dat, rank = 1) {
purrr::imap(dat, ~
dat %>%
count(!! rlang::sym(.y)) %>%
rename_all(~ c('variable', 'counts')) %>%
arrange(desc(counts)) %>%
slice(seq_len(rank))) #%>%
#bind_cols - convert to a data.frame
}
f1(mtcars, 2)
I would like to be able to use more automation when creating SpatialLines objects from otherwise tidy data frames.
library(sp)
#create sample data
sample_data <- data.frame(group_id = rep(c("a", "b","c"), 10),
x = rnorm(10),
y = rnorm(10))
#How can I recreate this using dplyr?
a_list <- Lines(list(Line(sample_data %>% filter(group_id == "a") %>% select(x, y))), ID = 1)
b_list <- Lines(Line(list(sample_data %>% filter(group_id == "b") %>% select(x, y))), ID = 2)
c_list <- Lines(Line(list(sample_data %>% filter(group_id == "c") %>% select(x, y))), ID = 3)
SpatialLines(list(a_list, b_list, c_list))
You can see how using something like group_by would make the process pretty easy if you could understand how the data could be piped into a list.
Using your sample data, a wrapper function, and dplyr::do will give you what you want :)
wrapper <- function(df) {
df %>% select(x,y) %>% as.data.frame %>% Line %>% list %>% return
}
y <- sample_data %>% group_by(group_id) %>%
do(res = wrapper(.))
# and now assign IDs (since we can't do that inside dplyr easily)
ids = 1:dim(y)[1]
SpatialLines(
mapply(x = y$res, ids = ids, FUN = function(x,ids) {Lines(x,ID=ids)})
)
I don't use sp so there might be a better way to assign IDs.
For reference, consider reading Hadley's comments on returning non-dataframe from dplyr do calls