Improve sampling between R dataframes

I am trying to improve speed in a case where I need to use data from dataframes and sample from other dataframes.
First I need to draw the number of samples I want from df_obs.
Then I need to determine a subset from where to sample based on which month I am in (the subset command).
Then I want to sample from the corresponding sample dataframe.
And finally put it all together in a new dataframe.
The code below works, but it is far too slow when I have to repeat this 1000 times. Is there an alternative method which uses apply functions better? Or perhaps some data.table function?
#Sample function to sample correct in case of only one value to sample from
resample <- function(x, ...) x[sample.int(length(x), ...)]
#Creating dummy data
no_pr_month <- sort(sample(67:120, 20))
df_obs <- data.frame(replicate(20,sample(0:5,1000,rep=TRUE)))
colnames(df_obs) <- no_pr_month
amount <- sample(50:50000,200)
month <- sample(no_pr_month,200, rep=TRUE)
df <- data.frame(month,amount)
df_sum <- data.frame(matrix(NA, ncol = 20, nrow=1000))
#The far too slow loop
for (k in 1:length(no_pr_month)){
  a <- df_obs[,k]
  df_sample <- subset(df, df$month == names(df_obs[k]))
  df_out <- sapply(a, function(x) sum(resample(df_sample$amount, x, replace = TRUE)))
  df_sum[,k] <- df_out
}
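(For context on the resample() helper above: with a length-one numeric x, base sample() falls back to sampling from 1:x, which would silently corrupt the draws here. A minimal illustration:)
sample(c(10), 2, replace = TRUE)   # may return any values from 1:10
resample(c(10), 2, replace = TRUE) # always returns c(10, 10)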

Note: before creating your data, I inserted set.seed(000) for consistent results
Even when using the data.table package, it's best to keep data organized in a "tidy" way: row-based.
So we'll start by changing your df_obs dataset to a long-form data.table.
library(data.table)
df_obs_long <- data.table(
  month = as.integer(rep(names(df_obs), each = nrow(df_obs))),
  obs_count = unlist(df_obs)
)
df_obs_long
# month obs_count
# 1: 69 4
# 2: 69 5
# 3: 69 1
# 4: 69 3
# 5: 69 0
# ---
# 19996: 116 4
# 19997: 116 1
# 19998: 116 2
# 19999: 116 3
# 20000: 116 5
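As an aside, the same long table could be built in one step with data.table's melt; a sketch, assuming the df_obs defined above:
df_obs_long2 <- melt(as.data.table(df_obs), measure.vars = names(df_obs),
                     variable.name = "month", value.name = "obs_count")
df_obs_long2[, month := as.integer(as.character(month))] # months arrive as factor levels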
Next we'll define a function that takes a vector of sample sizes and the number of the month to draw samples from. The function will return a vector of sample sums for each of the sizes given.
Making df a data.table doesn't save much as far as written code, but can cut down runtime by a good amount.
setDT(df)
sample_and_sum_month <- function(sizes, month_number) {
  choices <- df[month == month_number, amount]
  vapply(
    sizes,
    FUN.VALUE = numeric(1),
    FUN = function(s) {
      sum(resample(choices, size = s, replace = TRUE))
    }
  )
}
sample_and_sum_month(1:3, 69)
# [1] 12729 55068 28605
Finally, we can just add the sums as a new column in df_obs_long.
df_obs_long[
  ,
  sample_sum := sample_and_sum_month(obs_count, .BY[["month"]]),
  by = "month"
]
df_obs_long
# month obs_count sample_sum
# 1: 69 4 82662
# 2: 69 5 160761
# 3: 69 1 5743
# 4: 69 3 108783
# 5: 69 0 0
# ---
# 19996: 116 4 56792
# 19997: 116 1 22570
# 19998: 116 2 35337
# 19999: 116 3 64734
# 20000: 116 5 69075
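If you still need the original wide df_sum layout, the long result can be cast back; a sketch (the idx helper column is introduced here just for illustration):
df_obs_long[, idx := seq_len(.N), by = month]
df_sum2 <- dcast(df_obs_long, idx ~ month, value.var = "sample_sum")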

Related

Markov transition matrix from sequence of doctor visits for different patients

I am trying to create a Markov transition matrix from a sequence of doctor visits for different patients. In my Markov model, states are the different doctors and connections are visits by patients. A patient can stay with the same provider or transition to another for the next visit. Using that information I need to create a transition matrix.
Here is a part of the data in Excel; the full data includes more than 30K visits to almost 100 different providers.
[screenshot of sample data]
How can I use this Excel data (or CSV) to create a Markov transition matrix of visit counts, such as:
....
The matrix I need will look like this:
[screenshot of the desired transition matrix]
How can I transform my data to transition matrix with R?
I am fairly new to R and really need help.
Thank you
Here's an approach that works with your sample data.
I'll use readxl to get the data and data.table to manipulate it.
Reading data:
library(readxl)
library(data.table)
data <- setDT(read_excel("~/Desktop/Book2.xlsx"))[!is.na(PatId)]
#read_excel doesn't have the option to specify integers... silly...
data[ , (names(data)) := lapply(.SD, as.integer)]
Pre-allocate transition matrix:
provs <- data[ , sort(unique(SeenByProv))]
nprov <- length(provs)
markov <- matrix(nrow = nprov, ncol = nprov,
dimnames = list(provs, provs))
Assign row-by-row
for (pr in provs){
  markov[as.character(pr), ] <-
    data[ , {nxt <- SeenByProv[which(SeenByProv == pr) + 1L]
             .(prov = provs, count =
                 sapply(provs, function(pr2) sum(nxt == pr2, na.rm = TRUE)))}, by = PatId
        ][ , sum(count), by = prov]$V1
}
This can probably be sped up in a few places, but it works.
I wanted to compare a method that avoids data.table, and found it was roughly 45x faster (and probably more straightforward to understand).
First, I time the data.table solution from the accepted answer:
rm(list=ls())
library(readxl)
library(data.table)
############## Using data.table method() ######################
data <- setDT(read_excel("Book2.xlsx"))[!is.na(PatId)]
data[ , (names(data)) := lapply(.SD, as.integer)]
provs <- data[ , sort(unique(SeenByProv))]
nprov <- length(provs)
markov <- matrix(nrow = nprov, ncol = nprov, dimnames = list(provs, provs))
system.time( ## Timing the main loop
  for (pr in provs){
    markov[as.character(pr), ] <-
      data[ , {nxt <- SeenByProv[which(SeenByProv == pr) + 1L]
               .(prov = provs, count =
                   sapply(provs, function(pr2) sum(nxt == pr2, na.rm = TRUE)))}, by = PatId
          ][ , sum(count), by = prov]$V1
  }
)
# user system elapsed
# 3.128 0.000 3.135
table(markov)
#markov
# 0 1 2 3 4 5 6 7 8 9 10 11 13 22 140
#3003 308 89 34 14 11 6 4 1 3 4 1 1 1 1
Next using only base R calls:
############## Using all base R calls method() ###################
tm_matrix<-matrix(0, nrow = nprov, ncol = nprov, dimnames = list(provs, provs))
d<-read_excel("Book2.xlsx")
d<-d[!is.na(d$PatId),] # Note: Data is already ordered by PatId, DaysOfStudy
baseR <- function(tm_matrix){
  # Form the transitions and drop DaysOfStudy
  d1 <- cbind(d[-nrow(d), -3], d[-1, -3])
  colnames(d1) <- c("SeenByProv", "PatId", "NextProv", "PatId2")
  # Drop transitions between different patients
  d1 <- d1[d1$PatId == d1$PatId2, ]
  # Transform to strings to use as row and column names
  d1$SeenByProv <- as.character(d1$SeenByProv)
  d1$NextProv <- as.character(d1$NextProv)
  # Fill in the transition matrix
  for (i in 1:nrow(d1)){
    tm_matrix[d1$SeenByProv[i], d1$NextProv[i]] <- tm_matrix[d1$SeenByProv[i], d1$NextProv[i]] + 1
  }
  return(tm_matrix)
}
system.time(tm_matrix<-baseR(tm_matrix))
# user system elapsed
# 0.072 0.000 0.072
table(tm_matrix)
#tm_matrix
# 0 1 2 3 4 5 6 7 8 9 10 11 13 22 140
#3003 308 89 34 14 11 6 4 1 3 4 1 1 1 1
all.equal(markov,tm_matrix)
#[1] TRUE
My base-R method is 3.135/0.072 = 43.5 times faster.
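As a further note, the row-filling loop inside baseR() could itself be vectorized with table() on factor pairs; a sketch, assuming the d1 transition pairs built inside baseR() are in scope (fixing the factor levels keeps zero-count providers in the matrix):
from <- factor(d1$SeenByProv, levels = as.character(provs))
to   <- factor(d1$NextProv,   levels = as.character(provs))
tm_matrix2 <- unclass(table(from, to)) # same counts, without the explicit loop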

Find top deciles from dataframe by group

I am attempting to create new variables using a function and lapply rather than working directly in the data with loops. I used to use Stata and would have solved this problem with a method similar to that discussed here.
Since naming variables programmatically is difficult (or at least awkward) in R, and it seems you can't use indexing with assign, I have left the naming process until after the lapply. I then use a for loop to rename the columns prior to merging, and another loop to do the merging. Are there more efficient ways of doing this? How would I replace the loops? Should I be doing some sort of reshaping?
#Reproducible data
data <- data.frame("custID" = c(1:10, 1:20),
                   "v1" = rep(c("A", "B"), c(10, 20)),
                   "v2" = c(30:21, 20:19, 1:3, 20:6), stringsAsFactors = TRUE)
#Function to analyze customer distribution for each category (v1)
pf <- function(cat, df) {
  df <- df[df$v1 == cat,]
  df <- df[order(-df$v2),]
  #Divide the customers into top percents
  nr <- nrow(df)
  p10 <- round(nr * .10, 0)
  cat("Number of people in the Top 10% :", p10, "\n")
  p20 <- round(nr * .20, 0)
  p11_20 <- p20 - p10
  cat("Number of people in the 11-20% :", p11_20, "\n")
  #Keep only those customers in the top groups
  df <- df[1:p20,]
  #Create a variable to identify the percent group the customer is in
  top_pct <- integer(length = p10 + p11_20)
  #Identify those in each group
  top_pct[1:p10] <- 10
  top_pct[(p10+1):p20] <- 20
  #Add this variable to the data frame
  df$top_pct <- top_pct
  #Keep only custID and the new variable
  df <- subset(df, select = c(custID, top_pct))
  return(df)
}
##Run the customer distribution function
v1Levels <- levels(data$v1)
res <- lapply(v1Levels, pf, df = data)
#Explore the results
summary(res)
# Length Class Mode
# [1,] 2 data.frame list
# [2,] 2 data.frame list
print(res)
# [[1]]
# custID top_pct
# 1 1 10
# 2 2 20
#
# [[2]]
# custID top_pct
# 11 1 10
# 16 6 10
# 12 2 20
# 17 7 20
##Merge the two data frames but with top_pct as a different variable for each category
#Change the new variable name
for(i in 1:length(res)) {
  names(res[[i]])[2] <- paste0(v1Levels[i], "_top_pct")
}
#Merge the results
res_m <- res[[1]]
for(i in 2:length(res)) {
res_m <- merge(res_m, res[[i]], by = "custID", all = TRUE)
}
print(res_m)
# custID A_top_pct B_top_pct
# 1 1 10 10
# 2 2 20 20
# 3 6 NA 10
# 4 7 NA 20
Stick to your Stata instincts and use a single data set:
require(data.table)
DT <- data.table(data)
DT[,r:=rank(v2)/.N,by=v1]
You can see the result by typing DT.
From here, you can group the within-v1 rank, r, if you want to. Following Stata idioms...
DT[,g:={
  x = rep(0,.N)
  x[r>.8] = 20
  x[r>.9] = 10
  x
}]
This is like gen and then two replace ... if statements. Again, you can see the result with DT.
Finally, you can subset with
DT[g>0]
which gives
custID v1 v2 r g
1: 1 A 30 1.000 10
2: 2 A 29 0.900 20
3: 1 B 20 0.975 10
4: 2 B 19 0.875 20
5: 6 B 20 0.975 10
6: 7 B 19 0.875 20
These steps can also be chained together:
DT[,r:=rank(v2)/.N,by=v1][,g:={x = rep(0,.N);x[r>.8] = 20;x[r>.9] = 10;x}][g>0]
(Thanks to @ExperimenteR.)
To rearrange for the desired output in the OP, with values of v1 in columns, use dcast:
dcast(
DT[,r:=rank(v2)/.N,by=v1][,g:={x = rep(0,.N);x[r>.8] = 20;x[r>.9] = 10;x}][g>0],
custID~v1)
Currently, dcast requires the latest version of data.table, available (I think) from Github.
You don't need the function pf to achieve what you want. Try a dplyr/tidyr combo:
library(dplyr)
library(tidyr)
data %>%
  group_by(v1) %>%
  arrange(desc(v2)) %>%
  mutate(n = n()) %>%
  filter(row_number() <= round(n * .2)) %>%
  mutate(top_pct = ifelse(row_number() <= round(n * .1), 10, 20)) %>%
  select(custID, top_pct) %>%
  spread(v1, top_pct)
# custID A B
#1 1 10 10
#2 2 20 20
#3 6 NA 10
#4 7 NA 20
The idiomatic way to do this kind of thing in R would be to use a combination of split and lapply. You're halfway there with your use of lapply; you just need to use split as well.
lapply(split(data, data$v1), function(df) {
  cutoff <- quantile(df$v2, c(0.8, 0.9))
  top_pct <- ifelse(df$v2 > cutoff[2], 10, ifelse(df$v2 > cutoff[1], 20, NA))
  na.omit(data.frame(id = df$custID, top_pct))
})
Finding quantiles is done with quantile.
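For instance, to inspect the 80th and 90th percentile cutoffs used for group B (a quick hypothetical check, not part of the solution):
quantile(data$v2[data$v1 == "B"], c(0.8, 0.9))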

Random subset/sample of dataframe

I have a question about how to sample a dataframe. The dataset has 114 rows and 9 columns, and I need to extract 3 subsets, each one of 38 rows (114 / 3).
I have this script, but it doesn't work for the last subset:
install.packages("Rcmdr")
library(Rcmdr)
ana <- read.delim("~/Desktop/ana", header=TRUE, dec=",")
set1 <- ana[sample(nrow(ana), 38), ]
set1.index <- as.numeric(rownames(set1))
ana2 <- ana[(-set1.index),]
set2 <- ana2[sample(nrow(ana2), 38), ]
set2.index <- as.numeric(rownames(set2))
ana3 <- ana2[(-set2.index),]
ana3
For set1 and set2 I get the subsets correctly, but for the last subset I get 50 rows (or fewer).
(Thank you in advance! =) )
Generally @docendodiscimus gives valid advice, but the sampling code he offers will not guarantee equal numbers in the subsets (see below). Try this instead:
set.seed(123) # best to set a seed to allow reproducibility
sampidx <- sample(rep(1:3, each=38)) # a random permutation of 38 each of 1, 2, 3
set1 <- ana[sampidx==1, ] # logical indexing of dataframe
set2 <- ana[sampidx==2, ]
set3 <- ana[sampidx==3, ]
Lack of equipartition with sample using replacement:
> table( sample(1:3, nrow(iris), replace = TRUE) )
1 2 3
52 52 46
> table( sample(1:3, nrow(iris), replace = TRUE) )
1 2 3
51 49 50 # notice that it also varies from draw to draw
> table(sampidx)
sampidx
1 2 3
38 38 38
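The same idea generalizes to any number of equal groups with split(); a minimal sketch, assuming nrow(ana) is divisible by k:
k <- 3
sets <- split(ana, sample(rep(seq_len(k), each = nrow(ana) / k)))
# sets[["1"]], sets[["2"]], sets[["3"]] are the three disjoint subsets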

Creating combination of sequences

I am trying to solve following problem:
Consider 5 simple sequences: 0:100, 100:0, rep(0,101), rep(50,101), rep(100,101)
I need a set of 3 numeric variables which contain the above sequences in all combinations. Since there are 5 sequences and 3 variables, there are 5*5*5 = 125 combinations, hence a total of 12625 (125*101) numbers in each variable (101 for each combination).
These can be grouped in a data.frame of 12625 rows and 4 columns. First column (V) will simply have seq(1:12625) (rownumbers can be used in its place). Other 3 columns (A,B,C) will have above 5 sequences in different combinations. For example, the first 101 rows will have 0:100 in all 3 A,B and C. Next 101 rows will have 0:100 in A and B, and 100:0 in C. And so on...
I can create sequences as:
s = list()
s[[1]] = 0:100
s[[2]] = 100:0
s[[3]] = rep(0,101)
s[[4]] = rep(50,101)
s[[5]] = rep(100,101)
But how to proceed further? I do not really need the data frame but I need a function that returns a list containing the values of c(A,B,C) for the number (first or V column) sent to it. The number can obviously vary from 1 to 12625.
How can I create such a function? I would prefer a vectorized solution or one using apply-family functions to optimize for speed.
You asked for a vectorized solution, so here's one using only data.table (similar to @SimonG's methodology):
library(data.table)
grd <- CJ(A = seq_len(5), B = seq_len(5), C = seq_len(5))
res <- grd[, lapply(.SD, function(x) unlist(s[x]))]
res
# A B C
# 1: 0 0 0
# 2: 1 1 1
# 3: 2 2 2
# 4: 3 3 3
# 5: 4 4 4
# ---
# 12621: 100 100 100
# 12622: 100 100 100
# 12623: 100 100 100
# 12624: 100 100 100
# 12625: 100 100 100
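Since the OP ultimately wants a function that maps a row number V to the corresponding (A, B, C) values, res can simply be indexed, or the values can be computed directly without materializing the table. A sketch matching the CJ ordering above (C varies fastest); the names lookup and abc_at are introduced here for illustration:
lookup <- function(v) as.list(res[v]) # list(A=, B=, C=) for row v
abc_at <- function(n) {
  i <- (n - 1) %/% 101     # which of the 125 combinations (0-based)
  p <- (n - 1) %% 101 + 1  # position within the 101-long sequence
  c(A = s[[i %/% 25 + 1]][p],       # A changes slowest
    B = s[[(i %/% 5) %% 5 + 1]][p],
    C = s[[i %% 5 + 1]][p])         # C changes fastest
}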
I came up with two solutions. I find this hard to do with apply and the like, since they tend to give output that is not so nice to handle (maybe someone can "tame" them better than I can :D)
The first solution uses separate calls to lapply, the second one uses a for loop and some programming no-no's. Personally I prefer the second one, though the first is faster...
grd <- expand.grid(a=1:5,b=1:5,c=1:5)
# apply-ish
A <- lapply(grd[,1], function(z){ s[[z]] })
B <- lapply(grd[,2], function(z){ s[[z]] })
C <- lapply(grd[,3], function(z){ s[[z]] })
dfr <- data.frame(A=do.call(c,A), B=do.call(c,B), C=do.call(c,C))
# for-ish
mat <- NULL
for(i in 1:nrow(grd)){
  cur <- grd[i,]
  tmp <- cbind(s[[cur[,1]]], s[[cur[,2]]], s[[cur[,3]]])
  mat <- rbind(mat, tmp)
}
The output of both dfr and mat seem to be what you describe.
Cheers!

Aggregate over categories that contain NAs with ddply and lapply?

I would like to aggregate a data.frame over 3 categories, with one of them varying. Unfortunately this one varying category contains NAs (actually it's the reason why it needs to vary). Thus I created a list of data.frames. Every data.frame within this list contains only complete cases with respect to three variables (with only one of them changing).
Let's reproduce this:
library(plyr)
mydata <- warpbreaks
names(mydata) <- c("someValue","group","size")
mydata$category <- c(1,2,3)
mydata$categoryA <- c("A","A","X","X","Z","Z")
# add some NA
mydata$category[c(8,10,19)] <- NA
mydata$categoryA[c(14,1,20)] <- NA
# create a list of dfs that contains TRUE FALSE
noNAList <- function(vec){
  res <- !is.na(vec)
  return(res)
}
testTF <- lapply(mydata[,c("category","categoryA")],noNAList)
# create a list of data.frames
selectDF <- function(TFvec){
  res <- mydata[TFvec,]
  return(res)
}
# check x and see that it may contain NAs as long
# as it's not in one of the 3 categories I want to aggregate over
x <-lapply(testTF,selectDF)
## let's ddply get to work
doddply <- function(df){
  ddply(df, .(group, size), summarize, sumTest = sum(someValue))
}
y <- lapply(x, doddply);y
y comes very close to what I want to get
$category
group size sumTest
1 A L 375
2 A M 198
3 A H 185
4 B L 254
5 B M 259
6 B H 169
$categoryA
group size sumTest
1 A L 375
2 A M 204
3 A H 200
4 B L 254
5 B M 259
6 B H 169
But I need to implement aggregation over a third varying variable, which in this case is category or categoryA. Like this:
group size category sumTest sumTestTotal
1 A H 1 46 221
2 A H 2 46 221
3 A H 3 93 221
and so forth. How can I add names(x) to lapply, or do I need a loop or environment here?
EDIT:
Note that I want EITHER category OR categoryA added to the mix. In reality I have about 15 mutually exclusive categorical vars.
I think you might be making this really hard on yourself, if I understand your question correctly.
If you want to aggregate the data.frame 'myData' by three (or four) variables, you would simply do this:
aggregate(someValue ~ group + size + category + categoryA, sum, data=mydata)
group size category categoryA someValue
1 A L 1 A 51
2 B L 1 A 19
3 A M 1 A 17
4 B M 1 A 63
aggregate will automatically remove rows that include NA in any of the categories. If someValue is sometimes NA, then you can add the parameter na.rm=T.
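For instance, a hypothetical call with NAs in someValue would look like this (the extra argument is passed through to sum):
aggregate(someValue ~ group + size + category + categoryA, FUN = sum,
          data = mydata, na.rm = TRUE)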
I also noted that you put a lot of unnecessary code into functions. For example:
# create a list of data.frames
selectDF <- function(TFvec){
res <- mydata[TFvec,]
return(res)
}
Can be written like:
selectDF <- function(TFvec) mydata[TFvec,]
Also, using lapply to create a list of two data frames without the NA is overkill. Try this code:
x = list(mydata[!is.na(mydata$category),],mydata[!is.na(mydata$categoryA),])
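One caveat: unlike the lapply version, that list is unnamed, and later code may rely on the names; a sketch that keeps them:
x <- setNames(list(mydata[!is.na(mydata$category), ],
                   mydata[!is.na(mydata$categoryA), ]),
              c("category", "categoryA"))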
I know the question explicitly requests a ddply()/lapply() solution.
But ... if you are willing to come on over to the dark side, here is a data.table()-based function that should do the trick:
# Convert mydata to a data.table
library(data.table)
dt <- data.table(mydata, key = c("group", "size"))
# Define workhorse function
myfunction <- function(dt, VAR) {
  E <- as.name(substitute(VAR))
  dt[i = !is.na(eval(E)),
     j = {n <- sum(.SD[, someValue])
          .SD[, list(sumTest = sum(someValue),
                     sumTestTotal = n,
                     share = sum(someValue)/n),
              by = VAR]},
     by = key(dt)]
}
# Test it out
s1 <- myfunction(dt, "category")
s2 <- myfunction(dt, "categoryA")
ADDED ON EDIT
Here's how you could run this for a vector of different categorical variables:
catVars <- c("category", "categoryA")
ll <- lapply(catVars,
             FUN = function(X) {
               do.call(myfunction, list(dt, X))
             })
names(ll) <- catVars
lapply(ll, head, 3)
# $category
# group size category sumTest sumTestTotal share
# [1,] A H 2 46 185 0.2486486
# [2,] A H 3 93 185 0.5027027
# [3,] A H 1 46 185 0.2486486
#
# $categoryA
# group size categoryA sumTest sumTestTotal share
# [1,] A H A 79 200 0.395
# [2,] A H X 68 200 0.340
# [3,] A H Z 53 200 0.265
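If a single stacked table is preferred over the list ll, the pieces can be bound together with an id column; a sketch (the level and catVar column names are assumptions introduced here):
combined <- rbindlist(lapply(names(ll), function(nm) {
  x <- copy(ll[[nm]])
  setnames(x, nm, "level")           # unify the varying category column
  x[, level := as.character(level)]  # category is integer, categoryA is character
  x[, catVar := nm]
}))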
Finally, I found a solution that might not be as slick as Josh's, but it works without any dark forces (data.table). You may laugh; here's my reproducible example, using the same sample data as in the question.
qual <- c("category","categoryA")
# get T / F vectors
noNAList <- function(vec){
  res <- !is.na(vec)
  return(res)
}
selectDF <- function(TFvec) mydata[TFvec,]
NAcheck <- lapply(mydata[,qual],noNAList)
# create a list of data.frames
listOfDf <- lapply(NAcheck,selectDF)
workhorse <- function(charVec, listOfDf){
  dfs <- list2env(listOfDf)
  # create expression list
  exlist <- list()
  for(i in 1:length(qual)){
    exlist[[qual[i]]] <- parse(text=paste("ddply(", qual[i],
      ",.(group,size,", qual[i], "),summarize,sumTest = sum(someValue))",
      sep=""))
  }
  res <- lapply(exlist, eval, envir=dfs)
  return(res)
}
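For completeness, a hypothetical call (note that workhorse() as written ignores its charVec argument and instead closes over the global qual):
y2 <- workhorse(qual, listOfDf)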
Is this more like what you mean? I find your example extremely difficult to understand. In the code below, the method can take any column and then aggregate by it. It can return multiple aggregation functions of someValue. I then find all the column names you would like to aggregate by and apply the function to that vector.
# Build a method to aggregate by column.
agg.by.col = function(column) {
  by.list = list(mydata$group, mydata$size, mydata[,column])
  names(by.list) = c('group', 'size', column)
  aggregate(mydata$someValue, by=by.list, function(x) c(sum=sum(x), mean=mean(x)))
}
# Find all the column names you want to aggregate by
cols = names(mydata)[!(names(mydata) %in% c('someValue','group','size'))]
# Apply the method to each column name.
lapply(cols, agg.by.col)
