Merging the outputs of functions within a nested function in R

I have two functions that each produce a different output. I was trying to write a new function that merges the two outputs, but I keep getting an error saying an object is not found. I understand that once I leave either of the inner functions (inside the general function), the main function no longer recognizes the objects they created; what I don't know is how to make those outputs available to the main function. Here is the code.
#############################################################################
#############################################################################
# 1. datasets
IDr = seq(1, 5)
BTR = c("A", "B", "AB", "O", "O")
data_R = data.frame(IDr, BTR, A = c(0,1,rep(0,3)), B = c(0,rep(0,3),1), C = c(0,rep(1,3),0),
                    D = c(0,rep(1,4)), E = c(1,1,0,rep(1,1),0), stringsAsFactors = FALSE)
data_R
  IDr BTR A B C D E
1   1   A 0 0 0 0 1
2   2   B 1 0 1 1 1
3   3  AB 0 0 1 1 0
4   4   O 0 0 1 1 1
5   5   O 0 1 0 1 0
IDd = seq(1, 8)
BTD = c("A", "B", "AB", "O", "AB", "AB", "O", "O")
fg  = c(rep(0.0025, 2), rep(0.00125, 2), rep(0.0011, 2), rep(0.0015, 2))
data_D = data.frame(IDd, BTD, A = c(rep(0,5),1,1,1), B = c(rep(0,6),1,1), C = c(rep(1,7),0),
                    D = rep(1,8), E = c(rep(0,5),rep(1,2),0), fg, stringsAsFactors = FALSE)
data_D
  IDd BTD A B C D E      fg
1   1   A 0 0 1 1 0 0.00250
2   2   B 0 0 1 1 0 0.00250
3   3  AB 0 0 1 1 0 0.00125
4   4   O 0 0 1 1 0 0.00125
5   5  AB 0 0 1 1 0 0.00110
6   6  AB 1 0 1 1 1 0.00110
7   7   O 1 1 1 1 1 0.00150
8   8   O 1 1 0 1 0 0.00150
############################################################################
############################################################################
# first function
# calculate the frequency of each repeated set (A:E) using fg
library(dplyr)   # for %>%, group_by, across, summarise
freq <- function(df, Vars, col.interest){
  col.interest = as.data.frame(col.interest)
  resultat1 = df %>%
    group_by(across(all_of(Vars))) %>%
    dplyr::summarise(count = n(), frequency.epi = sum(fg), .groups = 'drop')
  res = merge(resultat1, col.interest, all = TRUE)
  res_final = cbind(df[1:2], res)
  return(res_final)
}
dfreq = freq(data_D, colnames(data_D)[3:7], data_D[3:7])
dfreq
  IDd BTD A B C D E count frequency.epi
1   1   A 0 0 1 1 0     5        0.0086
2   2   B 0 0 1 1 0     5        0.0086
3   3  AB 0 0 1 1 0     5        0.0086
4   4   O 0 0 1 1 0     5        0.0086
5   5  AB 0 0 1 1 0     5        0.0086
6   6  AB 1 0 1 1 1     1        0.0011
7   7   O 1 1 0 1 0     1        0.0015
8   8   O 1 1 1 1 1     1        0.0015
###############################################################
# the second function, which was corrected by @MrFlic
mis.test = function(D, R, threshold) {
  D = as.data.frame(D)
  R = as.data.frame(R)
  mismatch.i = function(i) {
    dif = purrr::map2_df(D[-1], R[i,-1], `-`)
    dif[dif < 0] = 0
    dif$mismatch = rowSums(dif)
    dif = cbind(ID = D[1], IDr = R[i,1], dif)
    dif = dif[which(dif$mismatch <= threshold), ]
    return(dif[c(1, 2, ncol(dif))])
  }
  diff.mat = do.call(rbind, lapply(1:nrow(R), function(x) mismatch.i(x)))
  diff.mat = as.data.frame(diff.mat)
  return(diff.mat)
}
# mis.test for a single recipient
mis_one = mis.test(data_D[, c(1, 3:7)], data_R[1, c(1, 3:7)], 2)
mis_one
  IDd IDr mismatch
1   1   1        2
2   2   1        2
3   3   1        2
4   4   1        2
5   5   1        2
# what I want the main function to do is this step (using these exact outputs)
merge(mis_one, dfreq, by = "IDd") # executed outside the function to show the expected output
# this is the output I expect when I run the main function
  IDd IDr mismatch BTD A B C D E count frequency.epi
1   1   1        2   A 0 0 1 1 0     5        0.0086
2   2   1        2   B 0 0 1 1 0     5        0.0086
3   3   1        2  AB 0 0 1 1 0     5        0.0086
4   4   1        2   O 0 0 1 1 0     5        0.0086
5   5   1        2  AB 0 0 1 1 0     5        0.0086
Here is the main function, which has many errors:
test.merge = function(D, DF, R, threshold, Vars, col.interest){
  R = as.data.frame(R)
  D = as.data.frame(D)
  DF = as.data.frame(DF)
  col.interest = as.data.frame(col.interest)
  # remark1: I repeated the same arguments here because I did not know
  # what to set in order to do the calculation
  freq.epi <- function(Vars, col.interest){
    resultat1 = DF %>%
      group_by(across(all_of(Vars))) %>%
      dplyr::summarise(count = n(), frequency.epi = sum(fg), .groups = 'drop')
    res = merge(resultat1, col.interest, all = TRUE)
    res_final = cbind(DF[1:2], res)
    return(res_final)
  }
  # same as remark1 for the arguments
  mis.test = function(D, R, threshold) {
    D = as.data.frame(D)
    R = as.data.frame(R)
    mismatch.i = function(i) {
      dif = purrr::map2_df(D[-1], R[i,-1], `-`)
      dif[dif < 0] = 0
      dif$mismatch = rowSums(dif)
      dif = cbind(ID = D[1], IDr = R[i,1], dif)
      dif = dif[which(dif$mismatch <= threshold), ]
      return(dif[c(1, 2, ncol(dif))])
    }
    diff.mat = do.call(rbind, lapply(1:nrow(R), function(x) mismatch.i(x)))
    diff.mat = as.data.frame(diff.mat)
    return(diff.mat)
  }
  # I don't know how to make diff.mat and res_final visible to test.merge;
  # I am trying to merge the two outputs res_final and diff.mat by IDd
  tab = merge(diff.mat, res_final, by = "IDd")
  return(tab)
}
test.merge(data_D[, c(1, 3:7)], data_D, data_R[1, c(1, 3:7)], 2, colnames(data_D)[3:7], data_D[3:7])
# Error in merge(diff.mat, res_final, by = "IDd") :
# object 'diff.mat' not found
I don't know if there are other ways to use the outputs of these functions within the main function. Thank you in advance for your help.
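To make the scoping issue concrete, here is a minimal sketch (not specific to the functions above): an object created inside a function is local to that call, and the only reliable way to hand it to the caller is through the return value.
f <- function() {
  x <- 1          # x exists only inside this call of f
  return(x)
}
f()               # [1] 1
exists("x")       # FALSE: f never created x in the calling environment
y <- f()          # capture the return value instead
This is exactly why diff.mat and res_final are "not found" in test.merge: they live inside mis.test and freq.epi and vanish when those functions return.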

Why do you want to mix all the functions into one? I would suggest keeping them separate and writing test.merge to only merge the data from the 2 outputs.
freq <- function(df, Vars, col.interest){
  col.interest = as.data.frame(col.interest)
  resultat1 = df %>%
    group_by(across(all_of(Vars))) %>%
    dplyr::summarise(count = n(), frequency.epi = sum(fg), .groups = 'drop')
  res = merge(resultat1, col.interest, all = TRUE)
  res_final = cbind(df[1:2], res)
  return(res_final)
}
mis.test = function(D, R, threshold) {
  D = as.data.frame(D)
  R = as.data.frame(R)
  mismatch.i = function(i) {
    dif = purrr::map2_df(D[-1], R[i,-1], `-`)
    dif[dif < 0] = 0
    dif$mismatch = rowSums(dif)
    dif = cbind(ID = D[1], IDr = R[i,1], dif)
    dif = dif[which(dif$mismatch <= threshold), ]
    return(dif[c(1, 2, ncol(dif))])
  }
  diff.mat = do.call(rbind, lapply(1:nrow(R), function(x) mismatch.i(x)))
  diff.mat = as.data.frame(diff.mat)
  return(diff.mat)
}

test.merge = function(x, y) {
  merge(x, y, by = "IDd")
}
test.merge(mis.test(data_D[, c(1, 3:7)], data_R[1, c(1, 3:7)], 2),
           freq(data_D, colnames(data_D)[3:7], data_D[3:7]))
#   IDd IDr mismatch BTD A B C D E count frequency.epi
# 1   1   1        2   A 0 0 1 1 0     5        0.0086
# 2   2   1        2   B 0 0 1 1 0     5        0.0086
# 3   3   1        2  AB 0 0 1 1 0     5        0.0086
# 4   4   1        2   O 0 0 1 1 0     5        0.0086
# 5   5   1        2  AB 0 0 1 1 0     5        0.0086
And here is the fix to your original code.
test.merge = function(D, R, threshold, DF, Vars, col.interest){
  R = as.data.frame(R)
  D = as.data.frame(D)
  DF = as.data.frame(DF)
  col.interest = as.data.frame(col.interest)
  freq.epi <- function(DF, Vars, col.interest){
    resultat1 = DF %>%
      group_by(across(all_of(Vars))) %>%
      dplyr::summarise(count = n(), frequency.epi = sum(fg), .groups = 'drop')
    res = merge(resultat1, col.interest, all = TRUE)
    res_final = cbind(DF[1:2], res)
    return(res_final)
  }
  mis.test = function(D, R, threshold) {
    D = as.data.frame(D)
    R = as.data.frame(R)
    mismatch.i = function(i) {
      dif = purrr::map2_df(D[-1], R[i,-1], `-`)
      dif[dif < 0] = 0
      dif$mismatch = rowSums(dif)
      dif = cbind(ID = D[1], IDr = R[i,1], dif)
      dif = dif[which(dif$mismatch <= threshold), ]
      return(dif[c(1, 2, ncol(dif))])
    }
    diff.mat = do.call(rbind, lapply(1:nrow(R), function(x) mismatch.i(x)))
    diff.mat = as.data.frame(diff.mat)
    return(diff.mat)
  }
  # call the helpers and merge their return values directly
  tab = merge(mis.test(D, R, threshold), freq.epi(DF, Vars, col.interest), by = "IDd")
  return(tab)
}
test.merge(data_D[, c(1, 3:7)], data_R[1, c(1, 3:7)], 2,
           data_D, colnames(data_D)[3:7], data_D[3:7])
I am sure this could be optimised and written in a better way (as suggested in the first part), but since I don't know the bigger picture here I'll leave this to the OP.
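As a usage note, since mis.test already loops over every row of R, the same call should also work for all recipients at once (an untested sketch, same arguments as above, just without subsetting data_R to its first row):
test.merge(data_D[, c(1, 3:7)], data_R[, c(1, 3:7)], 2,
           data_D, colnames(data_D)[3:7], data_D[3:7])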

Related

Preserve column name when making function

I have a dataframe that looks like this:
        ï..Employee_Name EmpID MarriedID MaritalStatusID GenderID EmpStatusID DeptID PerfScoreID FromDiversityJobFairID
1:    Adinolfi, Wilson K 10026         0               0        1           1      5           4                      0
2: Ait Sidi, Karthikeyan 10084         1               1        1           5      3           3                      0
3:     Akinkuolie, Sarah 10196         1               1        0           5      5           3                      0
4:          Alagbe,Trina 10088         1               1        0           1      5           3                      0
5:       Anderson, Carol 10069         0               2        0           5      5           3                      0
6:       Anderson, Linda 10002         0               0        0           1      5           4                      0
I wrote a count function:
HRdata_factor_count <- function(df, var) {
  df %>%
    count(df[[var]], sort = T) %>%
    rename(Variable = `df[[var]]`) %>%
    mutate(Variable = factor(Variable)) %>%
    mutate(Variable = fct_reorder(Variable, n))
}
It outputs "Variable" instead of the name of the variable given to the var argument:
   Variable   n
1:        0 187
2:        1 124
I would like to maintain the name of the variable that I tell the function to count without having to rename it inside the body of the function.
You can try this function:
library(dplyr)
HRdata_factor_count <- function(df, var) {
  df %>%
    count(.data[[var]], sort = T) %>%
    mutate(!!var := factor(.data[[var]]))
}

Calculate number of time streak of categories change in a row in R

I have the following data frame in R:
Row number   A  B  C  D  E  F  G  H  I  J
         1      1  1  0  0  1  0  0  1  1
         2         1  0  0  0  1  0  0  1
         3      1  0  0  0  1  0  0  1  1
I am trying to calculate the number of times the value changes between 1 and 0, excluding the nulls.
The result I am expecting is this:
Row Number   No of changes
----------   -------------
1            4
2            4
3            4
An explanation for row 1:
In row 1, A has a null so we exclude that.
B and C have 1, which is our first set of values.
D and E have 0, which is our second set of values. Now Change = 1.
F has 1, which is our third set of values. Now Change = 1+1.
G and H have 0, which is our fourth set of values. Now Change = 1+1+1.
I and J have 1, which is our fifth set of values. Now Change = 1+1+1+1 = 4.
Here's a tidyverse approach.
I gather into longer format (with tidyr::pivot_longer), then add a helper column noting when we have a change from 0 to 1 or from 1 to 0, and then sum those changes by row.
library(tidyverse)
df %>%
  # before tidyr 1.0, this would be gather(col, value, -1)
  pivot_longer(-1, "col") %>%
  group_by(Row.number) %>%
  mutate(chg = value == 1 & lag(value) == 0 |
               value == 0 & lag(value) == 1) %>%
  summarize(no_chgs = sum(chg, na.rm = T))
# A tibble: 3 x 2
  Row.number no_chgs
       <int>   <int>
1          1       4
2          2       4
3          3       4
Sample data:
df <- read.table(
  header = T,
  stringsAsFactors = F,
  text = "'Row number' A B C D E F G H I J
          1 NA  1 1 0 0 1 0 0 1 1
          2 NA NA 1 0 0 0 1 0 0 1
          3 NA  1 0 0 0 1 0 0 1 1")
Here's a data.table solution:
library(data.table)
dt <- as.data.table(df)
dt[,
   no_change := max(rleid(na.omit(t(.SD)))) - 1,
   by = Row.number   # the sample df's first column is named Row.number
]
dt
Alternatively, here's a base version:
apply(df[, -1],
      1,
      function(x) {
        complete_case = complete.cases(x)
        if (sum(complete_case) > 0) {
          return(length(rle(x[complete_case])$lengths) - 1)
        } else {
          return(0)
        }
      })
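For the sample data, the base version should return four changes per row (a named vector, names taken from the row names), matching the tidyverse result above:
# 1 2 3
# 4 4 4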

Identifying Duplicates in `data.frame` Using `dplyr`

I want to identify (not eliminate) duplicates in a data frame and add a 0/1 variable accordingly (whether a row is a duplicate or not), using the R dplyr package.
Example:
  | A B C D
1 | 1 0 1 1
2 | 1 0 1 1
3 | 0 1 1 1
4 | 0 1 1 1
5 | 1 1 1 1
Clearly, rows 1 and 2 are duplicates, so I want to create a new variable (with mutate?), say E, that is equal to 1 in rows 1, 2, 3 and 4, since rows 3 and 4 are also identical.
Moreover, I want to add another variable, F, that is equal to 1 if there is a duplicate differing only by one column. That is, F in rows 1, 2 and 5 would be equal to 1 since they differ only in the B column.
I hope it is clear what I want to do and I hope that dplyr offers a smooth solution to this problem. This is of course possible in "base" R but I believe (hope) that there exists a smoother solution.
You can use dist() to compute the differences, and then a search in the resulting distance object can give the needed answers (E, F, etc.). Here is an example code, where X is the original data.frame:
W = as.matrix(dist(X, method = "manhattan"))
X$E = as.integer(sapply(1:ncol(W), function(i, D){ any(W[-i, i] == D) }, D = 0))
X$F = as.integer(sapply(1:ncol(W), function(i, D){ any(W[-i, i] == D) }, D = 1))
Just change D = to the number of differing columns you need.
It's all base R though. Using plyr::laply instead of sapply has the same effect. dplyr looks like overkill here.
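For reference, a quick run on the example table (X rebuilt from the rows above) would give the following; note that F comes out as 1 for every row here, since rows 3 and 4 also differ from row 5 by a single column, a point the next answer makes as well:
X = data.frame(A = c(1,1,0,0,1), B = c(0,0,1,1,1), C = rep(1,5), D = rep(1,5))
W = as.matrix(dist(X, method = "manhattan"))
X$E = as.integer(sapply(1:ncol(W), function(i, D){ any(W[-i, i] == D) }, D = 0))
X$F = as.integer(sapply(1:ncol(W), function(i, D){ any(W[-i, i] == D) }, D = 1))
X
#   A B C D E F
# 1 1 0 1 1 1 1
# 2 1 0 1 1 1 1
# 3 0 1 1 1 1 1
# 4 0 1 1 1 1 1
# 5 1 1 1 1 0 1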
Here is a data.table solution that is extendable to an arbitrary case (1..n columns the same) - not sure if someone can convert it to dplyr for you. I had to change your dataset a bit to show your desired F column - in your example all rows would get a 1, because rows 3 and 4 are one column different from row 5 as well.
library(data.table)
DT <- data.frame(A = c(1,1,0,0,1), B = c(0,0,1,1,1), C = c(1,1,1,1,1),
                 D = c(1,1,1,1,1), E = c(1,1,0,0,0))
DT
  A B C D E
1 1 0 1 1 1
2 1 0 1 1 1
3 0 1 1 1 0
4 0 1 1 1 0
5 1 1 1 1 0
setDT(DT)
DT_ncols <- length(DT)
base <- data.table(t(combn(1:nrow(DT), 2)))
setnames(base, c("V1", "V2"), c("ind_x", "ind_y"))
DT[, ind := .I]
DT_melt <- melt(DT, id.var = "ind", variable.name = "column")
base <- merge(base, DT_melt, by.x = "ind_x", by.y = "ind", allow.cartesian = TRUE)
base <- merge(base, DT_melt, by.x = c("ind_y", "column"), by.y = c("ind", "column"))
base <- base[, .(common_cols = sum(value.x == value.y)), by = .(ind_x, ind_y)]
This gives us a data.frame that looks like this:
base
    ind_x ind_y common_cols
 1:     1     2           5
 2:     1     3           2
 3:     2     3           2
 4:     1     4           2
 5:     2     4           2
 6:     3     4           5
 7:     1     5           3
 8:     2     5           3
 9:     3     5           4
10:     4     5           4
This says that rows 1 and 2 have 5 common columns (duplicates). Rows 3 and 5 have 4 common columns, and 4 and 5 have 4 common columns. We can now use a fairly extendable format to flag any combination we want:
base <- melt(base, id.vars = "common_cols")
# Unique - common_cols == DT_ncols
DT[, F := ifelse(ind %in% unique(base[common_cols == DT_ncols, value]), 1, 0)]
# Same save 1 - common_cols == DT_ncols - 1
DT[, G := ifelse(ind %in% unique(base[common_cols == DT_ncols - 1, value]), 1, 0)]
# Same save 2 - common_cols == DT_ncols - 2
DT[, H := ifelse(ind %in% unique(base[common_cols == DT_ncols - 2, value]), 1, 0)]
This gives:
   A B C D E ind F G H
1: 1 0 1 1 1   1 1 0 1
2: 1 0 1 1 1   2 1 0 1
3: 0 1 1 1 0   3 1 1 0
4: 0 1 1 1 0   4 1 1 0
5: 1 1 1 1 0   5 0 1 1
Instead of manually selecting, you can append all combinations like so:
# run after base <- melt(base, id.vars = "common_cols")
base <- unique(base[,.(ind = value, common_cols)])
base[, common_cols := factor(common_cols, 1:DT_ncols)]
merge(DT, dcast(base, ind ~ common_cols, fun.aggregate = length, drop = FALSE), by = "ind")
   ind A B C D E 1 2 3 4 5
1:   1 1 0 1 1 1 0 1 1 0 1
2:   2 1 0 1 1 1 0 1 1 0 1
3:   3 0 1 1 1 0 0 1 0 1 1
4:   4 0 1 1 1 0 0 1 0 1 1
5:   5 1 1 1 1 0 0 0 1 1 0
Here is a dplyr solution:
test %>%
  mutate(flag = (A == lag(A) &
                 B == lag(B) &
                 C == lag(C) &
                 D == lag(D))) %>%
  mutate(twice = lead(flag) == T) %>%
  mutate(E = ifelse(flag == T | twice == T, 1, 0)) %>%
  mutate(E = ifelse(is.na(E), 0, 1)) %>%
  mutate(FF = ifelse(((A + lag(A)) + (B + lag(B)) + (C + lag(C)) + (D + lag(D))) == 7, 1, 0)) %>%
  mutate(FF = ifelse(is.na(FF) | FF == 0, 0, 1)) %>%
  select(A, B, C, D, E, FF)
Result:
  A B C D E FF
1 1 0 1 1 1  0
2 1 0 1 1 1  0
3 0 1 1 1 1  0
4 0 1 1 1 1  0
5 1 1 1 1 0  1

Get names of column with max value for each row

I need your help. I have a data frame like this:
int x y z
  1 0 1 0
  2 1 0 0
  3 0 0 1
and the result I need must be like this:
int letter
  1      y
  2      x
  3      z
My code is:
for (i in 1:nrow(samples))
  for (j in 1:ncol(samples))
    if (samples[i, ][, j] == 1) print(c(i, names(samples[i, j])))
but it is not showing the second column, and I need to save the result in a new data.frame. Any suggestions? Thanks.
You can use max.col:
DF$newcol <- names(DF)[-1][max.col(DF[-1])]
This gives
  int x y z newcol
1   1 0 1 0      y
2   2 1 0 0      x
3   3 0 0 1      z
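One caveat: max.col breaks ties at random by default, so if a row can contain more than one 1 you may want the deterministic variant:
DF$newcol <- names(DF)[-1][max.col(DF[-1], ties.method = "first")]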
I'm sure there are plenty of ways, but here's one:
samples <- read.table(text = "int x y z
                              1 0 1 0
                              2 1 0 0
                              3 0 0 1",
                      header = TRUE)
#  int x y z
#1   1 0 1 0
#2   2 1 0 0
#3   3 0 0 1

data.frame(
  samples[1],
  letter = colnames(samples[-1][apply(samples[-1], 1, which.max)])
)
#  int letter
#1   1      y
#2   2      x
#3   3      z
A solution to this similar question.
tdf <- data.frame(
  A = c(1,1,0,0),
  B = c(0,0,1,0),
  C = c(0,0,0,1)
)

library(magrittr)
tdf %>%
  lapply(sum) %>%                        # per-column counts of 1s
  (function(x){
    a <- c()
    for (i in 1:length(x)) {
      a <- c(a, rep(names(x[i]), x[i]))  # repeat each name by its count
    }
    return(a)
  })
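For tdf this returns each column name repeated by its column sum, which lines up with the rows here only because every row has exactly one 1 and the rows are grouped by letter:
# [1] "A" "A" "B" "C"
The max.col and which.max answers above do not rely on that ordering.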

Aggregating every 10 columns in binary matrice

I am new to R.
I would like to transform a binary matrix like this:
example:
" 1874 1875 1876 1877 1878 .... 2009
F 1 0 0 0 0 ... 0
E 1 1 0 0 0 ... 0
D 1 1 0 0 0 ... 0
C 1 1 0 0 0 ... 0
B 1 1 0 0 0 ... 0
A 1 1 0 0 0 ... 0"
Since the column names are years, I would like to aggregate them into decades and obtain something like:
"1840-1849 1850-1859 1860-1869 .... 2000-2009
F 1 0 0 0 0 ... 0
E 1 1 0 0 0 ... 0
D 1 1 0 0 0 ... 0
C 1 1 0 0 0 ... 0
B 1 1 0 0 0 ... 0
A 1 1 0 0 0 ... 0"
I am used to Python and do not know how to do this transformation without writing loops!
Thanks, isabel
It is unclear what aggregation you want, but using the following dummy data
set.seed(42)
df <- data.frame(matrix(sample(0:1, 6*25, replace = TRUE), ncol = 25))
names(df) <- 1874 + 0:24
The following counts events in each 10-year period.
Get the years as a numeric variable
years <- as.numeric(names(df))
Next we need an indicator for the start of each decade
ind <- seq(from = signif(years[1], 3), to = signif(tail(years, 1), 3), by = 10)
We then apply over the indices of ind (1:(length(ind)-1)), select columns from df that are the current decade and count the 1s using rowSums.
tmp <- lapply(seq_along(ind[-1]),
              function(i, inds, data) {
                rowSums(data[, names(data) %in% inds[i]:(inds[i+1] - 1)])
              }, inds = ind, data = df)
Next we cbind the resulting vectors into a data frame and fix-up the column names:
out <- do.call(cbind.data.frame, tmp)
names(out) <- paste(head(ind, -1), tail(ind, -1) - 1, sep = "-")
out
This gives:
> out
  1870-1879 1880-1889 1890-1899
1         4         5         6
2         4         6         6
3         2         5         5
4         5         5         7
5         3         3         7
6         5         5         4
If you want simply a binary matrix with a 1 indicating at least 1 event happened in that decade, then you can use:
tmp2 <- lapply(seq_along(ind[-1]),
               function(i, inds, data) {
                 as.numeric(rowSums(data[, names(data) %in% inds[i]:(inds[i+1] - 1)]) > 0)
               }, inds = ind, data = df)
out2 <- do.call(cbind.data.frame, tmp2)
names(out2) <- paste(head(ind, -1), tail(ind, -1) - 1, sep = "-")
out2
which gives:
> out2
  1870-1879 1880-1889 1890-1899
1         1         1         1
2         1         1         1
3         1         1         1
4         1         1         1
5         1         1         1
6         1         1         1
If you want a different aggregation, then modify the function applied in the lapply call to use something other than rowSums.
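For instance, a sketch that takes the per-row maximum for each decade instead of the row sums (the 0/1 version above is a special case of this):
tmp3 <- lapply(seq_along(ind[-1]),
               function(i, inds, data) {
                 apply(data[, names(data) %in% inds[i]:(inds[i+1] - 1)], 1, max)
               }, inds = ind, data = df)
out3 <- do.call(cbind.data.frame, tmp3)
names(out3) <- paste(head(ind, -1), tail(ind, -1) - 1, sep = "-")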
This is another option, using modular arithmetic to aggregate the columns.
# setup, borrowed from @GavinSimpson
set.seed(42)
df <- data.frame(matrix(sample(0:1, 6*25, replace = TRUE), ncol = 25))
names(df) <- 1874 + 0:24
result <- do.call(cbind,
                  by(t(df), as.numeric(names(df)) %/% 10 * 10, colSums))
# add -xxx9 to column names, for each decade
dimnames(result)[[2]] <- paste(colnames(result), as.numeric(colnames(result)) + 9, sep = '-')
#    1870-1879 1880-1889 1890-1899
# V1         4         5         6
# V2         4         6         6
# V3         2         5         5
# V4         5         5         7
# V5         3         3         7
# V6         5         5         4
If you wanted to aggregate with something other than sum, replace the call to
colSums with something like function(cols) lapply(cols, f), where f is the aggregating
function, e.g., max.
