Quicken this for loop? - r

I have a dataset called cpue with 3.3 million rows. I have made a subset of this dataframe called dat.frame. (See below for the heads of cpue and dat.frame.) I have added two new fields to dat.frame: "ssh_vec" and "ssh_mag". Although the heads of cpue and dat.frame look the same, the rest of the rows are not actually in the same order.
head(cpue)
code event Lat Long stat_area Day Month Year id
1 BCO 447602 -43.45 182.73 49 17 3 1995 1
head(dat.frame)
code event Lat Long stat_area Day Month Year id cal.jdate ssh_vec ssh_mag
1 BCO 447602 -43.45 182.73 49 17 3 1995 1 2449857 56.83898 4.499350
Currently, I am running a loop to add the ssh_vec and ssh_mag variables to "cpue" using the unique identifier "id":
cpue$ssh<- NA
cpue$sshmag<- NA
for (i in 1:nrow(dat.frame)) {
  ndx <- dat.frame$id[i]
  cpue$ssh[ndx]    <- dat.frame$ssh_vec[i]
  cpue$sshmag[ndx] <- dat.frame$ssh_mag[i]
}
This has been running over the weekend and is only up to:
i
[1] 132778
... out of:
nrow(dat.frame)
[1] 2797789
Within the loop, there is nothing that looks too computationally demanding. Is there a better alternative?

Are you sure you need a for loop at all? I think this might be equivalent:
cpue$ssh[dat.frame$id]    <- dat.frame$ssh_vec
cpue$sshmag[dat.frame$id] <- dat.frame$ssh_mag

I would recommend taking a look at data.table. Since I don't have your data, here is a simple example using dummy data.
library(data.table)
N = 10^6
dat <- data.table(
  x = rnorm(N),
  g = sample(LETTERS, N, replace = TRUE)
)
dat2 <- dat[,list(mx = mean(x)),g]
h = merge(dat, dat2, 'g')
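Applied to your actual data, the same idea would be a merge on the id key rather than a grouped summary. A rough sketch (untested, since I don't have your data; it assumes cpue and dat.frame as described in the question):
library(data.table)
cpue      <- data.table(cpue, key = "id")
dat.frame <- data.table(dat.frame, key = "id")
# join on id and carry the two new columns across
cpue <- merge(cpue, dat.frame[, list(id, ssh_vec, ssh_mag)], by = "id", all.x = TRUE)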

Do you even need to loop? From the code fragment posted it would appear not.
cpue$ssh[dat.frame$id]    <- dat.frame$ssh_vec
cpue$sshmag[dat.frame$id] <- dat.frame$ssh_mag
should work. A quick (and small) dummy example:
set.seed(666)
ssh <- rnorm(10^4)
datf <- data.frame(id = sample.int(10000L), ssh = NA)
system.time(datf$ssh[datf$id] <- ssh) # user 0, system 0, elapsed 0
# Reset dummy data
datf$ssh <- NA
system.time({
  for (i in 1:nrow(datf)) {
    ndx <- datf$id[i]
    datf$ssh[ndx] <- ssh[i]
  }
})  # user 2.26, system 0.02, elapsed 2.28
PS - I've not used the data.table package, so I don't follow Ramnath's answer. In general you should avoid loops if possible (see fortune(142) and Circle 3 of The R Inferno).

Related

Looping row numbers from one dataframe to create new data using logical operations in R

I would like to extract a dataframe that shows how many years it takes for the NInd variable (dataset p1) to recover after the culling events, which are shown in dataframe e1.
I have the following datasets (mine are much bigger, but just to give you something to play with):
# Dataset 1
Batch <- c(2,2,2,2,2,2,2,2,2,2)
Rep <- c(0,0,0,0,0,0,0,0,0,0)
Year <- c(0,0,1,1,2,2,3,3,4,4)
RepSeason <- c(0,0,0,0,0,0,0,0,0,0)
PatchID <- c(17,25,19,16,21,24,23,20,18,33)
Species <- c(0,0,0,0,0,0,0,0,0,0)
Selected <- c(1,1,1,1,1,1,1,1,1,1)
Nculled <- c(811,4068,1755,449,1195,1711,619,4332,457,5883)
e1 <- data.frame(Batch,Rep,Year,RepSeason,PatchID,Species,Selected,Nculled)
# Dataset 2
Batch <- c(2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2)
Rep <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
Year <- c(0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2)
RepSeason <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
PatchID <- c(17,25,19,16,21,24,23,20,18,33,17,25,19,16,21,24,23,20,18,33,17,25,19,16,21,24,23,20,18,33)
Ncells <- c(6,5,6,4,4,5,6,5,5,5,6,5,6,4,4,5,6,7,3,5,4,4,3,3,4,4,5,5,6,4)
Species <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
NInd <- c(656,656,262,350,175,218,919,218,984,875,700,190,93,127,52,54,292,12,43,68,308,1000,98,29,656,656,262,350,175,300)
p1 <- data.frame(Batch, Rep, Year, RepSeason, PatchID, Ncells, Species, NInd)
The dataset e1 shows only those years where some culling happened to the population in a specific PatchID.
I have created the following script that basically uses each row from e1 to create a Recovery number. Maybe there is an easier way to get to the end, but this is the one I managed to put together...
When you run it, you are working on ONE row of e1: we focus on the first PatchID encountered, do some calculations to match it up with p1, and finally get a number named Recovery.
Now, the thing is my dataframe has 50,000 rows, so doing this over and over looks quite tedious. That's where I thought a loop might be useful, but I have tried and had no luck making it work at all...
# Here is where I would like the loop
library(dplyr)
e2 <- e1[1, ] # Trial for one row only; the idea is to have a loop here that does what follows for each row
e3 <- e2 %>%
  select(1, 2, 4, 5)
p2 <- p1[, c(1, 2, 4, 5, 3, 6, 7, 8)] # Re-order
row2 <- which(apply(p2, 1, function(x) return(all(x == e3))))
p3 <- p1 %>%
  slice(row2) # All years with that particular patch in that particular Batch
# How many times was this patch culled during this replicate?
e4 <- e2[, c(1, 2, 4, 5, 3, 6, 7, 8)]
e4 <- e4 %>%
  select(1, 2, 3, 4)
c_batch <- e1[, c(1, 2, 4, 5, 3, 6, 7, 8)]
row <- which(apply(c_batch, 1, function(x) return(all(x == e4))))
c4 <- c_batch %>%
  slice(row)
# Number of years to recover to 95% of what there was before the cull
c5 <- c4[1, ] # Extract the first time it was culled
c5 <- c5 %>%
  select(1:5)
row3 <- which(apply(p2, 1, function(x) return(all(x == c5))))
Before <- p2 %>%
  slice(row3)
NInd <- Before[, 8]  # Number of individuals before the cull
Year2 <- Before[, 5] # Year the first cull happened (the number actually corresponds to individuals before the cull, as the Pop file is produced during reproduction, while the Cull file is produced after!)
Percent <- (95 * NInd) / 100 # The 95% recovery we want corresponds to 95% of NInd BEFORE the cull happened (Year2)
After <- p3 %>%
  filter(NInd >= Percent & Year > Year2) # Rows that match the number of individuals and Year
After2 <- After[1, ] # We just want the first year where the recovery was achieved
Recovery <- After2$Year - Before$Year
# no. of years to reach 95% of the population immediately before the cull
I reckon that the end would have to change somehow to tell R that we are creating a dataframe with the Recovery, something like:
Batch <- c(1,1,2,2)
Rep <- c(0,0,0,0)
PatchID <- c(17,25,30,12)
Recovery <- c(1,2,1,5)
Final <- data.frame(Batch, Rep, PatchID, Recovery)
Would that be possible? Or is this just too messed up and should I try a different way?
Does the following solve the problem correctly?
I have first added a unique ID to your data.frames to allow matching of the cull and population files (this saves most of your complicated look-up code):
# Add a unique ID for the patch/replicate etc. (as done in the example code)
e1$RepID = paste(e1$Batch, e1$Rep, e1$RepSeason, e1$PatchID, sep = ":")
p1$RepID = paste(p1$Batch, p1$Rep, p1$RepSeason, p1$PatchID, sep = ":")
If you want a quick overview of the number of times each patch was culled, the new RepID makes this easy:
# How many times was each patch culled?
table(e1$RepID)
Then you want a loop to check the recovery time after each cull.
My solution uses an sapply loop (which also retains the RepIDs so you can match to other metadata later):
sapply(unique(e1$RepID), function(rep_id) {
  all_cull_events = e1[e1$RepID == rep_id, , drop = F]
  first_year = order(all_cull_events$Year)[1] # The first cull year (assuming data might not be in temporal order)
  first_cull_event = all_cull_events[first_year, ] # The row corresponding to the first cull event
  population_counts = p1[p1$RepID == first_cull_event$RepID, ] # The population counts for this plot/replicate
  population_counts = population_counts[order(population_counts$Year), ] # Order by year (assuming data might not be in temporal order)
  pop_at_first_cull_event = population_counts[population_counts$Year == first_cull_event$Year, "NInd"]
  population_counts_after_cull = population_counts[population_counts$Year > first_cull_event$Year, , drop = F]
  years_to_recovery = which(population_counts_after_cull$NInd >= (pop_at_first_cull_event * .95))[1] # First year to pass the 95% threshold
  return(years_to_recovery)
})
2:0:0:17 2:0:0:25 2:0:0:19 2:0:0:16 2:0:0:21 2:0:0:24 2:0:0:23 2:0:0:20 2:0:0:18 2:0:0:33
1 2 1 NA NA NA NA NA NA NA
(The output contains some NAs because the first cull year was outside the range of population counts in the data you gave us)
Please check this against your expected output though. There were some aspects of the question and example code that were not clear (see comments).
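If you want the output shaped like the Final data frame in your question, one way is to split the RepID names back into their components. A small sketch, assuming you store the result of the sapply call above in a variable called recovery (my name, not part of your code):
id_parts <- do.call(rbind, strsplit(names(recovery), ":", fixed = TRUE))
Final <- data.frame(Batch    = as.numeric(id_parts[, 1]),
                    Rep      = as.numeric(id_parts[, 2]),
                    PatchID  = as.numeric(id_parts[, 4]),
                    Recovery = unname(recovery))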

Filtering a data frame in R using a for loop

I am new to R and am running into difficulty with more advanced filtering. I have a data frame containing 1500 rows of people in households and need to filter out everyone who is part of a household where at least 1 person is older than 24. For example, in the sample set below I would only want to keep rows 3,4, and 5.
PersonalID DOB HouseholdID
1 1961-04-15 123
2 2017-01-12 123
3 2000-01-02 122
4 2001-03-05 122
5 1996-08-22 122
Initially I just filtered to get a new data frame with everyone in that age range, and then filtered the original data frame again (and again and again and so on...) with each HouseholdID of someone under 25, to check whether anyone else with that HouseholdID is over 24.
Whenever I'm doing the same thing over and over it seems like there's probably a way to use a function instead but I'm having a hard time coming up with one that works. This is my current attempt but I know there's plenty wrong with it:
UNDER25df <- filter(df, DOB >= "yyyy-mm-dd")
for (UNDER25df$HouseholdID in df) {
if (all(df$DOB >= "yyyy-mm-dd")) {
view(filter(df, HouseholdID == "$HouseholdID"))
}
}
The error I get is:
unexpected '}' in "}"
but I'm pretty sure that I can nest an if statement in a for loop in R, and I've been careful about the positioning of the brackets, so I don't know exactly what it's referring to.
What I'm not sure of is if I can iterate through a data frame in this way or if this even makes sense. I've read that vectoring might be better in general for advanced filtering but tried to read the documentation on it and couldn't really see how to make that jump to this problem. Does anyone have a suggestion or a direction I should be looking in?
You do not need a loop for this. Try
library(lubridate)
library(dplyr)
set.seed(1)
df <- tibble(DOB = Sys.Date() - sample(3000:12000, 6),
             personalID = 1:6,
             HouseholdID = c(1, 1, 2, 2, 2, 3))
df$DOB
# grab householdID from all persons that are at least 24
oldies <- df[(lubridate::today() - lubridate::ymd(df$DOB)) > years(24),
"HouseholdID", TRUE]
# base R way
oldies <- df[as.Date(df$DOB) < as.Date("1993-2-10"), # a DOB before the cutoff means at least 24
             "HouseholdID", TRUE]
# household members in a household with someone 24 or older
df %>%
filter(HouseholdID %in% oldies)
# household members in a household with no one 24 or older
df %>%
filter(!(HouseholdID %in% oldies))
I am not sure if you want to keep the rows, grouped by HouseholdID, where all members are 24 years old or younger. If so, then maybe you can try the code below:
library(lubridate)
dfout <- subset(df, ave(floor(time_length(Sys.Date()-as.Date(DOB),"years"))<=24, HouseholdID, FUN = all))
If you really want to use a for loop to do it, then below is an example:
dfout <- data.frame()
for (id in unique(df$HouseholdID)) {
subdf <- subset(df,HouseholdID == id)
if (with(subdf, all(floor(time_length(Sys.Date()-as.Date(DOB),"years"))<=24))) {
dfout <- rbind(dfout,subdf)
}
}
Both approaches above can give you the result shown as
> dfout
PersonalID DOB HouseholdID
3 3 2000-01-02 122
4 4 2001-03-05 122
5 5 1996-08-22 122
DATA
df <- structure(list(PersonalID = 1:5, DOB = c("1961-04-15", "2017-01-12",
"2000-01-02", "2001-03-05", "1996-08-22"), HouseholdID = c(123L,
123L, 122L, 122L, 122L)), class = "data.frame", row.names = c(NA,
-5L))
I am not sure if you want to select households where all the people are above 24, or those where at least one person is above 24. In any case, you can use subset with ave:
subset(df, ave(as.integer(format(Sys.Date(), "%Y")) -
               as.integer(format(as.Date(DOB), "%Y")) >= 24, HouseholdID, FUN = any))
This selects households where at least one person is above 24. If you want to select households where all people are above 24, use all instead of any in the FUN argument.
Similarly, using dplyr, we can use
library(dplyr)
df %>%
  group_by(HouseholdID) %>%
  filter(any(as.integer(format(Sys.Date(), "%Y")) -
             as.integer(format(as.Date(DOB), "%Y")) >= 24))
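And if the goal is the one stated in the original question (keep only households where nobody is 24 or older), the same idiom works with the condition negated. A quick sketch with subset/ave:
subset(df, !ave(as.integer(format(Sys.Date(), "%Y")) -
                as.integer(format(as.Date(DOB), "%Y")) >= 24, HouseholdID, FUN = any))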

R: Loop through data.frame in row-pairs

I would like to process some GPS data rows pairwise.
For now I am doing it in a normal for loop, but I'm sure there is a better and faster way.
library(sp) # for spDists()
n = 100
testdata <- as.data.frame(cbind(runif(n, 1, 10), runif(n, 0, 360), runif(n, 14, 16), runif(n, 46, 49)))
colnames(testdata) <- c("speed", "heading", "long", "lat")
head(testdata)
diffmatrix <- as.data.frame(matrix(ncol = 3, nrow = dim(testdata)[1] - 1))
colnames(diffmatrix) <- c("distance", "heading_diff", "speed_diff")
for (i in 1:(dim(testdata)[1] - 1)) {
  diffmatrix[i, 1] <- spDists(as.matrix(testdata[i:(i + 1), c('long', 'lat')]),
                              longlat = T, segments = T) * 1000
  diffmatrix[i, 2] <- testdata[i + 1, ]$heading - testdata[i, ]$heading
  diffmatrix[i, 3] <- testdata[i + 1, ]$speed - testdata[i, ]$speed
}
head(diffmatrix)
How would I do that with an apply function?
Or is it even possible to do that calculation in parallel?
Thank you very much!
I'm not sure what you want to do with the end condition but with dplyr you can do all of this without using a for loop.
library(dplyr)
library(sp) # for spDists()

testdata %>%
  mutate(heading_diff = c(diff(heading), 0),
         speed_diff   = c(diff(speed), 0),
         longdiff     = c(diff(long), 0),
         latdiff      = c(diff(lat), 0)) %>%
  rowwise() %>%
  mutate(spdist = spDists(cbind(c(long, long + longdiff), c(lat, lat + latdiff)),
                          longlat = T, segments = T) * 1000) %>%
  select(heading_diff, speed_diff, distance = spdist)
# heading_diff speed_diff distance
# <dbl> <dbl> <dbl>
# 1 15.9 0.107 326496
# 2 -345 -4.64 55184
# 3 124 -1.16 25256
# 4 85.6 5.24 221885
# 5 53.1 -2.23 17599
# 6 -184 2.33 225746
I will explain each part below:
The pipe operator %>% is essentially a chain that sends the results from one operation into the next. So we start with your test data and send it to the mutate function.
Use mutate to create 4 new columns that are the difference measurements from one row to the next, adding 0 at the last row because there is no measurement following the last data point (you could use NA instead).
Next, once you have the differences, use rowwise so that the spDists function can be applied to each row.
Last, we create another column with mutate that uses the 4 difference columns created earlier.
To get only the 3 columns you were concerned with, I used a select statement at the end. You can leave this out if you want the entire dataframe.
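For what it's worth, the distance part can also be done in one vectorised call, because spDists() with segments = TRUE already returns the distances between consecutive points. A short sketch (assuming the sp package is loaded and testdata as defined in the question):
diffmatrix <- data.frame(
  distance     = spDists(as.matrix(testdata[, c("long", "lat")]),
                         longlat = TRUE, segments = TRUE) * 1000,
  heading_diff = diff(testdata$heading),
  speed_diff   = diff(testdata$speed)
)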

Calculate marginal totals as a function within a ddply call

I have been working on a file to calculate hospital infection rates. I want to standardise the infection rates to yearly procedure counts. The data are located here because they are too big for dput. SSI is the surgical infection indicator (1 = infected, 0 = not infected), Procedure is the type of procedure, and Year has been derived using lubridate.
library(plyr)
fname <- "https://raw.github.com/johnmarquess/some.data/master/hospG.csv"
download.file(fname, destfile='hospG.csv', method='wget')
hospG <- read.csv('hospG.csv')
Inf_table <- ddply(hospG, "Year", summarise,
Infections = sum(SSI == 1),
Procedures = length(Procedure),
PropInf = round(Infections/Procedures * 100 ,2)
)
This gives me the number of infections, procedures, and proportion infected per year for this hospital.
What I would like is an additional column with the standardised proportion infected. The long way to do this outside Inf_table is:
s1 <- sum(Inf_table$Infections)
s2 <- sum(Inf_table$Procedures)
Expected_prop_inf <- Inf_table$Procedures * s1/s2
Is there a way to get ddply to do this? I tried making a function with the calculation to produce Expected_prop_inf, but I did not get very far.
Thanks for any help offered.
It's more difficult with ddply because you are dividing by a number outside the grouping. Better to do it with base R.
# base
> with(Inf_table, Procedures*(sum(Infections)/sum(Procedures)))
[1] 17.39184 17.09623 23.00847 20.84065 24.83141 24.83141
rather than with ddply which is not so natural:
# NB note .(Year) is unique for every row, you might also use rownames
> s1 <- sum(Inf_table$Infections)
> s2 <- sum(Inf_table$Procedures)
> ddply(Inf_table, .(Year), summarise, Procedures*(s1/s2))
Year ..1
1 2001 17.39184
2 2002 17.09623
3 2003 23.00847
4 2004 20.84065
5 2005 24.83141
6 2006 24.83141
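If all you want is that value attached as a column of Inf_table, a one-line sketch along the same lines:
Inf_table$Expected_prop_inf <- with(Inf_table, Procedures * sum(Infections) / sum(Procedures))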
Here is a solution to aggregate using data.table.
I'm not sure if it's possible to do it in one step.
require("data.table")
fname <- "https://raw.github.com/johnmarquess/some_data/master/hospG.csv"
hospG <- read.csv(fname)
Inf_table <- DT[, {Infections = sum(SSI == 1)
Procedures = length(Procedure)
PropInf = round(Infections/Procedures * 100 ,2)
list(
Infections = Infections,
Procedures = Procedures,
PropInf = PropInf
)
}, by = Year]
Inf_table[,Expected_prop_inf := list(Procedures * sum(Infections)/sum(Procedures))]
tables()
The added bonus of this approach is that you are not creating another data.table in the second step; instead, a new column is added to the existing data.table by reference. This would be relevant if your datasets were bigger.
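For what it's worth, the two steps can also be chained into a single expression, since data.table calls compose; a sketch (untested against the real file):
Inf_table <- DT[, list(Infections = sum(SSI == 1),
                       Procedures = length(Procedure),
                       PropInf    = round(sum(SSI == 1) / length(Procedure) * 100, 2)),
                by = Year][, Expected_prop_inf := Procedures * sum(Infections) / sum(Procedures)][]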

How to efficiently do aggregate on sparse data

I have a large dataset with 1008412 observations; the columns are customer_id (int), visit_date (Date, format: "2010-04-04"), and visit_spend (float).
This date function for the aggregate maps week numbers of interest to the range 13-65:
weekofperiod <- function(dt) {
  as.numeric(format(as.Date(dt), "%W")) + 52 * (as.numeric(format(as.Date(dt), "%Y")) - 2010)
}
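As a quick sanity check of that mapping (values worked out by hand, so please verify):
weekofperiod("2010-04-04")  # 13, the first week of the period
weekofperiod("2011-03-15")  # 63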
Each customer_id has a variable number of total visits over a 53-week period.
For each customer_id, I want to get the aggregate of spend_per_week, by weekofperiod().
The code below is functionally correct but very slow - comments to make it faster?
Also, aggregate() produces sparse output where weeks without visits are missing, so I initialize spend_per_week to 0 and then manually assign the non-zero results from aggregate() row by row, to make sure the result always has 53 rows. Surely that can be done better?
Sample dataset lines look like:
customer_id visit_date visit_spend
72 40 2011-03-15 18.38
73 40 2011-03-20 23.45
74 79 2010-04-07 150.87
75 79 2010-04-17 101.90
76 79 2010-05-02 111.90
and here's the code with aggregate call and adjustment for empty weeks:
for (cid in all_tt_cids) {
  print_pnq('Getting statistics for cid', cid)
  # Get row indices of the selected subset, for just this cid's records
  I <- which(tt$customer_id == cid & tt$visit_date <= "2011-03-31")
  # (other code to compute other per-cid statistics)
  # spend_per_week (mode; mean; sd)
  # Aggregate spend_per_week, but beware this should be 0 for those weeks with no visits
  spend_per_week <- data.frame(c(list('weekofperiod' = 13:65), list('spendperweek' = 0)))
  nonzero_spends_per_week <- aggregate(tt$visit_spend[I], list('weekofperiod' = weekofperiod(tt$visit_date[I])), FUN = "sum")
  for (i in 1:nrow(nonzero_spends_per_week)) {
    spend_per_week[spend_per_week$weekofperiod == nonzero_spends_per_week[i, 1], 2] <- nonzero_spends_per_week[i, 2]
  }
  colnames(spend_per_week)[2] <- 'spend_per_week'
  # (code to compute and store per-cid statistics on spend_per_week)
}
Your biggest speed up is going to come if you replace the for loops. I can't quite tell from your example, because you overwrite each customer in the loop, but here's one way to do it if you want to keep the info for all subjects.
For testing, first define functions for the original method, and a new method without loops:
weekofperiod <- function(dt) {
  as.numeric(format(as.Date(dt), "%W")) + 52 * (as.numeric(format(as.Date(dt), "%Y")) - 2010)
}

FastMethod <- function(tt) {
  tt$week = weekofperiod(tt$visit_date)
  spend_per_week.tmp = as.data.frame(tapply(tt$visit_spend, tt[, c(1, 4)], sum))
  spend_per_week = data.frame(matrix(0, nrow = nrow(spend_per_week.tmp), ncol = length(13:65)))
  colnames(spend_per_week) = 13:65
  rownames(spend_per_week) = rownames(spend_per_week.tmp)
  spend_per_week[, colnames(spend_per_week.tmp)] = spend_per_week.tmp
  spend_per_week
}

OrigMethod <- function(tt) {
  all_tt_cids = unique(tt$customer_id)
  for (cid in all_tt_cids) {
    # Get row indices of the selected subset, for just this cid's records
    I <- which(tt$customer_id == cid & tt$visit_date <= "2011-03-31")
    # Aggregate spend_per_week, but beware this should be 0 for those weeks with no visits
    spend_per_week <- data.frame(c(list('weekofperiod' = 13:65), list('spendperweek' = 0)))
    nonzero_spends_per_week <- aggregate(tt$visit_spend[I], list('weekofperiod' = weekofperiod(tt$visit_date[I])), FUN = "sum")
    for (i in 1:nrow(nonzero_spends_per_week)) {
      spend_per_week[spend_per_week$weekofperiod == nonzero_spends_per_week[i, 1], 2] <- nonzero_spends_per_week[i, 2]
    }
    colnames(spend_per_week)[2] <- 'spend_per_week'
  }
  spend_per_week
}
Now simulate a larger dataset so it's easier to compare:
n.row = 10^4
n.cust = 10^3
customer_id = 1:n.cust
dates = seq(as.Date('2010-04-01'), as.Date('2011-03-31'), by=1)
visit_date = sample(dates, n.row, replace=T)
visit_spend = runif(n.row, 0, 200)
tt = data.frame(customer_id, visit_date, visit_spend)
Finally, compare the two methods:
> system.time(FastMethod(tt))
user system elapsed
0.082 0.001 0.083
> system.time(OrigMethod(tt))
user system elapsed
4.505 0.007 4.514
This is already 50x faster, and I bet you can make it even better with more optimization. Good luck!
Here is a faster method using data.table, which is also easier to read.
FasterMethod <- function(tt) {
  # LOAD LIBRARIES
  require(reshape2)
  require(data.table)
  tt <- transform(tt, week_of_period = weekofperiod(visit_date))
  # AGGREGATE SPEND BY CUSTOMER AND WEEK OF PERIOD
  tt  <- data.table(tt)
  ans <- tt[, list(spend = sum(visit_spend)), 'customer_id, week_of_period']
  # RESHAPE TO CUSTOMER ID VS. WEEK OF PERIOD
  dcast(ans, customer_id ~ week_of_period, value.var = 'spend')
}
We can benchmark this against FastMethod and OrigMethod using rbenchmark, and see that we gain a 1.3x speedup over FastMethod and an overall speedup of 70x:
library(rbenchmark)
benchmark(FastMethod(tt), FasterMethod(tt), replications = 40)
test elapsed relative
FastMethod(tt) 5.594 1.346654
FasterMethod(tt) 4.154 1.000000
You can speed it up even further (2.5 x compared to FastMethod) if you did not care about reshaping the final output to customer id vs. week of period.
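The non-reshaped variant mentioned above would look roughly like this (a sketch; the function name is mine, it returns the long format with one row per customer_id and week_of_period, and it assumes weekofperiod() as defined earlier):
EvenFasterMethod <- function(tt) {
  require(data.table)
  tt <- data.table(tt)
  tt[, week_of_period := weekofperiod(visit_date)]
  tt[, list(spend = sum(visit_spend)), by = 'customer_id,week_of_period']
}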
