Preserve column name when making function - r

I have a dataframe that looks like this:
ï..Employee_Name EmpID MarriedID MaritalStatusID GenderID EmpStatusID DeptID PerfScoreID FromDiversityJobFairID
1: Adinolfi, Wilson K 10026 0 0 1 1 5 4 0
2: Ait Sidi, Karthikeyan 10084 1 1 1 5 3 3 0
3: Akinkuolie, Sarah 10196 1 1 0 5 5 3 0
4: Alagbe,Trina 10088 1 1 0 1 5 3 0
5: Anderson, Carol 10069 0 2 0 5 5 3 0
6: Anderson, Linda 10002 0 0 0 1 5 4 0
I wrote a count function:
HRdata_factor_count <- function(df, var) {
df %>%
count(df[[var]], sort = T) %>%
rename(Variable = `df[[var]]`) %>%
mutate(Variable = factor(Variable)) %>%
mutate(Variable = fct_reorder(Variable, n ))
}
It outputs "variable" instead of the name of the variable given to the var argument:
Variable n
1: 0 187
2: 1 124
I would like to maintain the name of the variable that I tell the function to count without having to rename it inside the body of the function.

You can try this function :
library(dplyr)
HRdata_factor_count <- function(df, var) {
df %>%
count(.data[[var]], sort = T) %>%
mutate(!!var := factor(.data[[var]]))
}

Related

merging the outputs of functions within nested function in R

I have 2 functions that each gives a different output, i was trying to create a new function that merge the 2 outputs of theses functions, but keep getting an error saying the object is not found, i understand that once i am out of any of my functions (inside of the general function), the main function does not recognize these objects. I do not know how to make these outputs recognizable in the global environment for the main function. Here is the code
#############################################################################
#############################################################################
# 1. datasets
IDr= c(seq(1,5))
BTR=c("A","B","AB","O","O")
data_R=data.frame(IDr,BTR,A=c(0,1,rep(0,3)),B=c(0,rep(0,3),1),C=c(0,rep(1,3),0),D=c(0,rep(1,4)),E=c(1,1,0,rep(1,1),0),stringsAsFactors=FALSE)
data_R
IDr BTR A B C D E
1 1 A 0 0 0 0 1
2 2 B 1 0 1 1 1
3 3 AB 0 0 1 1 0
4 4 O 0 0 1 1 1
5 5 O 0 1 0 1 0
IDd= c(seq(1,8))
BTD= c("A","B","AB","O","AB","AB","O","O")
fg= c(rep(0.0025, each=2),rep(0.00125, each=2),rep(0.0011, each=2),rep(0.0015, each=2))
data_D=data.frame(IDd,BTD,A=c(rep(0,5),1,1,1),B=c(rep(0,6),1,1),C=c(rep(1,7),0),D=rep(1,8),E=c(rep(0,5),rep(1,2),0),fg,stringsAsFactors=FALSE)
data_D
IDd BTD A B C D E fg
1 1 A 0 0 1 1 0 0.00250
2 2 B 0 0 1 1 0 0.00250
3 3 AB 0 0 1 1 0 0.00125
4 4 O 0 0 1 1 0 0.00125
5 5 AB 0 0 1 1 0 0.00110
6 6 AB 1 0 1 1 1 0.00110
7 7 O 1 1 1 1 1 0.00150
8 8 O 1 1 0 1 0 0.00150
############################################################################
############################################################################
# fist function
# calulate the frequency of repeated set (A:E) using fg
freq<- function(df, Vars,col.interest){
col.interest=as.data.frame(col.interest)
resultat1= df %>%
group_by(across(all_of(Vars))) %>%
dplyr::summarise(count = n(), frequency.epi = sum(fg), .groups = 'drop')
res=merge(resultat1,col.interest,all=TRUE)
res_final=cbind(df[1:2],res)
return(res_final)
}
dfreq= freq(data_D,colnames(data_D)[3:7],data_D[3:7])
dfreq
IDd BTD A B C D E count frequency.epi
1 1 A 0 0 1 1 0 5 0.0086
2 2 B 0 0 1 1 0 5 0.0086
3 3 AB 0 0 1 1 0 5 0.0086
4 4 O 0 0 1 1 0 5 0.0086
5 5 AB 0 0 1 1 0 5 0.0086
6 6 AB 1 0 1 1 1 1 0.0011
7 7 O 1 1 0 1 0 1 0.0015
8 8 O 1 1 1 1 1 1 0.0015
###############################################################
# the second function that was corrected by #MrFlic
mis.test = function(D, R, threshold) {
D = as.data.frame(D)
R = as.data.frame(R)
mismatch.i = function(i) {
dif = purrr::map2_df(D[-1], R[i,-1], `-`)
dif[dif<0] = 0
dif$mismatch=rowSums(dif)
dif = cbind(ID = D[1],IDr=R[i,1], dif)
dif = dif[which(dif$mismatch <= threshold),]
return(list=dif[c(1,2,ncol(dif))])
}
diff.mat = do.call(rbind, lapply(1:nrow(R), function(x) mismatch.i(x)))
diff.mat = as.data.frame(diff.mat)
return(diff.mat)
}
# if i want mis.test for 1 person
mis_one=mis.test(data_D[,c(1,3:7)],data_R[1,c(1,3:7)],2)
mis_one
IDd IDr mismatch
1 1 1 2
2 2 1 2
3 3 1 2
4 4 1 2
5 5 1 2
# what i want to do in the main function is this step (for example using these exact outputs)
merge(mis_one,dfreq,by="IDd") # this was executed outside to show the expected output
# this is the output expected that i want if i run the main function
IDd IDr mismatch BTD A B C D E count frequency.epi
1 1 1 2 A 0 0 1 1 0 5 0.0086
2 2 1 2 B 0 0 1 1 0 5 0.0086
3 3 1 2 AB 0 0 1 1 0 5 0.0086
4 4 1 2 O 0 0 1 1 0 5 0.0086
5 5 1 2 AB 0 0 1 1 0 5 0.0086
Here is the main function, with many errors
test.merge=function(D,DF,R,threshold,Vars,col.interest){
R=as.data.frame(R)
D=as.data.frame(D)
DF=as.data.frame(DF)
col.interest=as.data.frame(col.interest)
# remark1: Here i know i repeated the same arguments because i did not know what to set in order to do the calculation
freq.epi<- function( Vars,col.interest){
resultat1= DF %>%
group_by(across(all_of(Vars))) %>%
dplyr::summarise(count = n(), frequency.epi = sum(fg), .groups = 'drop')
res=merge(resultat1,col.interest,all=TRUE)
res_final=cbind(DF[1:2],res)
return(res_final)
}
# same as remark1 for the arguments
mis.test = function(D, R, threshold) {
D = as.data.frame(D)
R = as.data.frame(R)
mismatch.i = function(i) {
dif = purrr::map2_df(D[-1], R[i,-1], `-`)
dif[dif<0] = 0
dif$mismatch=rowSums(dif)
dif = cbind(ID = D[1],IDr=R[i,1], dif)
dif = dif[which(dif$mismatch <= threshold),]
return(list=dif[c(1,2,ncol(dif))])
}
diff.mat = do.call(rbind, lapply(1:nrow(R), function(x) mismatch.i(x)))
diff.mat = as.data.frame(diff.mat)
return(diff.mat)
}
# i dont know how to make diff.mat and res_final visible for test.merge
# i am trying to merge the two outputs res_final and diff.mat by the IDd
tab=merge(diff.mat,res_final,by="IDd")
return(tab)
}
test.merge(data_D[,c(1,3:7)],data_D,data_R[1,c(1,3:7)],2,colnames(data_D)[3:7],data_D[3:7])
# Error in merge(diff.mat, res_final, by = "IDd") :
# object 'diff.mat' not found
I dont know if there is other ways to use the outputs of functions within the main function. Thank you in advance for your help
Why do you want to mix all the functions into one? I would suggest to keep them separate and write test.merge to only merge data from 2 outputs.
freq<- function(df, Vars,col.interest){
col.interest=as.data.frame(col.interest)
resultat1= df %>%
group_by(across(all_of(Vars))) %>%
dplyr::summarise(count = n(), frequency.epi = sum(fg), .groups = 'drop')
res=merge(resultat1,col.interest,all=TRUE)
res_final=cbind(df[1:2],res)
return(res_final)
}
mis.test = function(D, R, threshold) {
D = as.data.frame(D)
R = as.data.frame(R)
mismatch.i = function(i) {
dif = purrr::map2_df(D[-1], R[i,-1], `-`)
dif[dif<0] = 0
dif$mismatch=rowSums(dif)
dif = cbind(ID = D[1],IDr=R[i,1], dif)
dif = dif[which(dif$mismatch <= threshold),]
return(list=dif[c(1,2,ncol(dif))])
}
diff.mat = do.call(rbind, lapply(1:nrow(R), function(x) mismatch.i(x)))
diff.mat = as.data.frame(diff.mat)
return(diff.mat)
}
test.merge = function(x, y) {
merge(x,y,by="IDd")
}
test.merge(mis.test(data_D[,c(1,3:7)],data_R[1,c(1,3:7)],2),
freq(data_D,colnames(data_D)[3:7],data_D[3:7]))
# IDd IDr mismatch BTD A B C D E count frequency.epi
#1 1 1 2 A 0 0 1 1 0 5 0.0086
#2 2 1 2 B 0 0 1 1 0 5 0.0086
#3 3 1 2 AB 0 0 1 1 0 5 0.0086
#4 4 1 2 O 0 0 1 1 0 5 0.0086
#5 5 1 2 AB 0 0 1 1 0 5 0.0086
And here is the fix to your original code.
test.merge=function(D,R,threshold,DF, Vars,col.interest){
R=as.data.frame(R)
D=as.data.frame(D)
DF=as.data.frame(DF)
col.interest=as.data.frame(col.interest)
freq.epi<- function(DF, Vars,col.interest){
resultat1= DF %>%
group_by(across(all_of(Vars))) %>%
dplyr::summarise(count = n(), frequency.epi = sum(fg), .groups = 'drop')
res=merge(resultat1,col.interest,all=TRUE)
res_final=cbind(DF[1:2],res)
return(res_final)
}
# same as remark1 for the arguments
mis.test = function(D, R, threshold) {
D = as.data.frame(D)
R = as.data.frame(R)
mismatch.i = function(i) {
dif = purrr::map2_df(D[-1], R[i,-1], `-`)
dif[dif<0] = 0
dif$mismatch=rowSums(dif)
dif = cbind(ID = D[1],IDr=R[i,1], dif)
dif = dif[which(dif$mismatch <= threshold),]
return(list=dif[c(1,2,ncol(dif))])
}
diff.mat = do.call(rbind, lapply(1:nrow(R), function(x) mismatch.i(x)))
diff.mat = as.data.frame(diff.mat)
return(diff.mat)
}
tab=merge(mis.test(D, R, threshold),freq.epi(DF, Vars, col.interest),by="IDd")
return(tab)
}
test.merge(data_D[,c(1,3:7)],data_R[1,c(1,3:7)],2,data_D, colnames(data_D)[3:7],data_D[3:7])
I am sure this could be optimised and written in a better way (as suggested in 1st part) but since I don't know the bigger picture here I'll leave this to OP.

How to give a score to each dataframe value based on scorecard in another database?

I'm trying to create a scorecard for the values relative to the scorecard (both below).
values <- data.frame(A= c(-200,-150,-100,0,100),
B= c(100,0,-101,-201,-300),
C= c(-400,400,500,-500,250),
D= c(NA,NA,-1000,-1000,-1000),
E= c(1000,1000,1,-1000,-2000))
scorecard <- data.frame(Names = c("A","B","C","D","E"),
"Score5" = c(-100,-200,-300,-400,-500))
values
A B C D E
1 -200 100 -400 -1000 1000
2 -150 0 400 -1000 1000
3 -100 -101 500 -1000 1
4 0 -201 -500 -1000 -1000
5 100 -300 250 -1000 -2000
If A's value < -100 (i.e. scorecard[1,2]), the scorecard dataframe should say 5, otherwise it should say 0. And I'd like to do this for all of A, B, C, D and E in one dataframe. The desired output is:
# A B C D E
#1 5 0 5 NA 0
#2 5 0 0 NA 0
#3 0 0 0 5 0
#4 0 5 5 5 5
#5 0 5 0 5 5
I've tried the following - which required the packaged xts: install.packages("xts") but I didn't quite get there.
pointsfunction <- function(value) {
points <- c()
for(i in names) {
index = which(colnames(value)==i)
data_start <- which(!is.na(value))[1]
points[1:(data_start -1)] <- NA
for(a in (data_start):(length(value))) {
if(value[a] < scorecard[index, 2]) {
points[a] <- -5
} else {
points[a] <- 0
}
}
}
points <- reclass(points, value)
return(points)
}
scorecardpoints <- as.data.frame(lapply(values, pointsfunction))
I got the following error:
Error in if (value[a] < scorecard[index, 2]) { : argument is of
length zero Called from: FUN(X[[i]], ...)
Any ideas?
Using dplyr and tidyr you can try :
library(dplyr)
library(tidyr)
values %>%
mutate(row = row_number()) %>%
pivot_longer(cols = -row, names_to = 'Names') %>%
left_join(scorecard, by = 'Names') %>%
mutate(value = if_else(value < Score5, 5, 0)) %>%
select(-Score5) %>%
pivot_wider(names_from = Names, values_from = value) %>%
select(-row)
Or perhaps a simpler base R option :
mat <- sweep(values, 2, scorecard$Score5[match(names(values), scorecard$Names)], `<`)
values[mat] <- 5
values[!mat] <- 0
values
# A B C D E
#1 5 0 5 NA 0
#2 5 0 0 NA 0
#3 0 0 0 5 0
#4 0 5 5 5 5
#5 0 5 0 5 5
I like Ronak's answer, but here's an apply-family based solution:
sapply(names(values), function(x) values[[x]] < scorecard$Score5[scorecard$Names == x]) * 5
# A B C D E
# [1,] 5 0 5 NA 0
# [2,] 5 0 0 NA 0
# [3,] 0 0 0 5 0
# [4,] 0 5 5 5 5
# [5,] 0 5 0 5 5

can I use string split with dcast in data.table?

Split a string, build columns with unique values, and fill values according to string.
Sample data.table:
library(data.table)
(dt <- data.table(id = as.numeric(1:5),
x = c(NA, "ab.cde", "co.hij.ab", "cox.cde.kl", NA)))
dcast Approach: close but not quite
dcast(dt, id ~ x, value.var = "id")
dt[dcast(dt, id ~ x, value.var = "id"), on=.(id = id)]
dcast buils some columns and fills some values, but it doesn't do what I want.
string split Approach: I can't transpose
dt[, unique(unlist(strsplit(dt$x, ".", fixed = TRUE))) :=
tstrsplit(dt$x, ".", fixed = TRUE)]
the message says that my LHS has 7 columns while my RHS only has 3. So transposing doesn't work. Maybe I can build the columns and fill the values later:
dt[, unique(unlist(strsplit(dt$x, ".", fixed = TRUE))) := character()]
And now i'm getting close but still not there. I need to fill those columns with 1 and 0s according to a match (or something) on dt$x;
id 1 should have a 1 on column: NA
id 2 should have a 1 on columns: ab, and cde
id 3 should have a 1 on columns: co, hij, and ab
id 4 should have a 1 on columns: cox, cde, and kl
id 5 should have a 1 on column: NA
We can use data.table methods i.e. dcast
library(data.table)
dcast(dt[, {x1 <- strsplit(x, "\\."); c(list(unlist(x1)),
.SD[rep(seq_len(.N), lengths(x1))])}], id + x ~ V1, length)
# id x NA ab cde co cox hij kl
#1: 1 <NA> 1 0 0 0 0 0 0
#2: 2 ab.cde 0 1 1 0 0 0 0
#3: 3 co.hij.ab 0 1 0 1 0 1 0
#4: 4 cox.cde.kl 0 0 1 0 1 0 1
#5: 5 <NA> 1 0 0 0 0 0 0
One option using dplyr and tidyr is to split the string on "." and put it into separate rows and then spread it into wide format.
library(dplyr)
library(tidyr)
dt %>%
mutate(x1 = x) %>%
separate_rows(x, sep = "\\.") %>%
mutate(temp = 1) %>%
spread(x, temp, fill = 0)
# id x1 ab cde co cox hij kl <NA>
#1 1 <NA> 0 0 0 0 0 0 1
#2 2 ab.cde 1 1 0 0 0 0 0
#3 3 co.hij.ab 1 0 1 0 1 0 0
#4 4 cox.cde.kl 0 1 0 1 0 1 0
#5 5 <NA> 0 0 0 0 0 0 1

propagate changes down a column

I would like to use dplyr to go through a dataframe row by row, and if A == 0, then set B to the value of B in the previous row, otherwise leave it unchanged. However, I want "the value of B in the previous row" to refer to the previous row during the computation, not before the computation began, because the value may have changed -- in other words, I'd like changes to propagate downwards. For example, with the following data:
dat <- data.frame(A=c(1,0,0,0,1),B=c(0,1,1,1,1))
A B
1 0
0 1
0 1
0 1
1 1
I would like the result of the computation to be:
result <- data.frame(A=c(1,0,0,0,1),B=c(0,0,0,0,1))
A B
1 0
0 0
0 0
0 0
1 1
If I use something like result <- dat %>% mutate(B = ifelse(A==0,lag(B),B) then changes won't propagate downwards: result$B will be equal to c(0,0,1,1,1), not c(0,0,0,0,1).
More generally, how do you use dplyr::mutate to create a column that depends on itself (as it updates during the computation, not a copy of what it was before)?
Seems like you want a "last observation carried forward" approach. The most common R implementation is zoo::na.locf which fills in NA values with the last observation. All we need to do to use it in this case is to first set to NA all the B values that we want to fill in:
mutate(dat,
B = ifelse(A == 0, NA, B),
B = zoo::na.locf(B))
# A B
# 1 1 0
# 2 0 0
# 3 0 0
# 4 0 0
# 5 1 1
As to my comment, do note that the only thing mutate does is add the column to the data frame. We could do it just as well without mutate:
result = dat
result$B = with(result, ifelse(A == 0, NA, B))
result$B = zoo::na.locf(result$B)
Whether you use mutate or [ or $ or any other method to access/add the columns is tangential to the problem.
We could use fill from tidyr after changing the 'B' values to NA that corresponds to 0 in 'A'
library(dplyr)
library(tidyr)
dat %>%
mutate(B = NA^(!A)*B) %>%
fill(B)
# A B
#1 1 0
#2 0 0
#3 0 0
#4 0 0
#5 1 1
NOTE: By default, the .direction (argument in fill) is "down", but it can also take "up" i.e. fill(B, .direction="up")
Here's a solution using grouping, and rleid (Run length encoding id) from data.table. I think it should be faster than the zoo solution, since zoo relies on doing multiple revs and a cumsum. And rleid is blazing fast
Basically, we only want the last value of the previous group, so we create a grouping variable based on the diff vector of the rleid and add that to the rleid if A == 1. Then we group and take the first B-value of the group for every case where A == 0
library(dplyr)
library(data.table)
dat <- data.frame(A=c(1,0,0,0,1),B=c(0,1,1,1,1))
dat <- dat %>%
mutate(grp = data.table::rleid(A),
grp = ifelse(A == 1, grp + c(diff(grp),0),grp)) %>%
group_by(grp) %>%
mutate(B = ifelse(A == 0, B[1],B)) # EDIT: Always carry forward B on A == 0
dat
Source: local data frame [5 x 3]
Groups: grp [2]
A B grp
<dbl> <dbl> <dbl>
1 1 0 2
2 0 0 2
3 0 0 2
4 0 0 2
5 1 1 3
EDIT: Here's an example with a longer dataset so we can really see the behavior: (Also, switched, it should be if all A != 1 not if not all A == 1
set.seed(30)
dat <- data.frame(A=sample(0:1,15,replace = TRUE),
B=sample(0:1,15,replace = TRUE))
> dat
A B
1 0 1
2 0 0
3 0 1
4 0 1
5 0 0
6 0 0
7 1 1
8 0 0
9 1 0
10 0 0
11 0 0
12 0 0
13 1 0
14 1 1
15 0 0
Result:
Source: local data frame [15 x 3]
Groups: grp [5]
A B grp
<int> <int> <dbl>
1 0 1 1
2 0 1 1
3 0 1 1
4 0 1 1
5 0 1 1
6 0 1 1
7 1 1 3
8 0 1 3
9 1 0 5
10 0 0 5
11 0 0 5
12 0 0 5
13 1 0 6
14 1 1 7
15 0 1 7

sum up cells in matrix according to different hierarchical level

I am using R to make a heatmap from binary interactions. The matrix looks as following
9 401 562 68 71 569 700
9 0 1 0 0 0 0 1
401 0 0 1 0 0 na 1
562 0 1 0 1 1 0 1
68 1 1 0 0 0 0 1
71 1 na 0 0 na 0 1
569 1 1 0 1 0 0 0
700 0 0 0 0 0 0 0
Also, I have metadata corresponding to my Ids
compart group family category
9 Ex Prt A Ps
401 Ex Prt A Ps
562 Ex Prt B Rh
68 In Prt C En
71 In Act D Stp
569 In Act D Stp
700 Ex Act E Aqua
I would like to sum cells at different level, ex here according to family. The table looks then like
A B C D E
A 1 1 0 0 1
B 1 0 0 na 1
C 2 0 0 0 1
D 3 0 1 0 0
E 0 0 0 0 0
And also would like to do it at compart level and so on.
I am looking for solutions that would avoid me to do it manually and go for hours of work.
Your best bet is to flatten or "stretch out" the matrix. Try the following
library(magrittr)
library(data.table)
library(reshape2)
## Let IDs be the metadata data.frame
DT_ids <- as.data.table(Ids, keep.rownames=TRUE)
# DT_ids[, rn := as.numeric(rn)]
setkey(DT_ids, rn)
## Let M be the interactions matrix
## Reshape the interactions data into a tall data.table
DT_interactions <- M %>%
as.data.table(keep.rownames=TRUE) %>%
melt(id.vars = "rn", value.name="interaction")
## Clean up the column names
setnames(DT_interactions, c("rn", "variable"), c("rn.rows", "rn.cols"))
## Add in two copies of the meta data
## one for "rows" of M and one for "cols" of M
DT_interactions[, paste0(names(DT_ids), ".rows") := DT_ids[.(rn.rows)]]
DT_interactions[, paste0(names(DT_ids), ".cols") := DT_ids[.(rn.cols)]]
## Set the key of DT_interactions
setkey(DT_interactions, rn.rows, rn.cols)
## NOW TO SUM UP
DT_interactions[, sum(interaction), by=c("family.rows", "family.cols")]
I would wrap that last part in a nice function
sumByMeta <- function(..., na.rm=TRUE) {
byCols_simple <- list(...) %>% unlist
byCols <- byCols_simple %>%
lapply(paste0, c(".rows", ".cols")) %>%
unlist
L <- length(byCols)
formula <- paste( byCols[1:(L/2)], byCols[(L/2 + 1) : L]
, sep=ifelse(L > 2, " + ", "~"), collapse=" ~ ")
DT_interactions[, sum(interaction, na.rm=na.rm), by=byCols] %>%
dcast.data.table(formula=as.formula(formula), value.var="V1") %>%
setnames(old=seq_along(byCols_simple), new=byCols_simple) %>% {.}
}
## EG:
sumByMeta("family")
# family A B C D E
# 1: A 1 1 0 0 2
# 2: B 1 0 1 1 1
# 3: C 2 0 0 0 1
# 4: D 3 0 1 0 1
# 5: E 0 0 0 0 0
## Try running these
sumByMeta("family")
sumByMeta("group")
sumByMeta("family", "group")
sumByMeta("family", "group", "compart")
sumByMeta("family", "compart")

Resources