Extremely slow calculations in Spark (using sparklyr)

I'm using the sparklyr library to calculate some metrics on big data. I decided to test my code on a small portion of the data (a parquet table, ~1.3M rows, 40 MB), but after some basic filtering Spark needs approximately 2-3 minutes to calculate a sum.
Here is an example of my code:
#Connecting
df <- spark_read_parquet(spark_conn, name = "df", path = "hdfs://****.db/table",
                         header = TRUE, memory = FALSE, repartition = 1, overwrite = FALSE)
#Basic filtering
df <- df %>%
  mutate(ZD = datediff(Maturity_Date, Treaty_Date))  # Spark's datediff(end, start) takes two arguments
df <- df %>%
  filter(Treaty_Amount <= 30000, ZD <= 30)
df <- df %>%
  dplyr::mutate(MD_Received_From_Borrower_2018_1 = rowSums(.[139:150], na.rm = TRUE))
df <- df %>%
  dplyr::mutate(PD_Received_From_Borrower_2018_1 = rowSums(.[175:186], na.rm = TRUE))
df <- df %>%
  mutate(Total_Amount_Received_From_Borrower_2018_1 = MD_Received_From_Borrower_2018_1 + PD_Received_From_Borrower_2018_1)
#More filtering for project purposes
df <- df %>%
  mutate(Preditog1_2018_01 = ifelse(Debt_Duration_As_Of_31.01.2018 >= 361 & Debt_Duration_As_Of_31.01.2019 == 0 & Total_Amount_Received_From_Borrower_2018_1 == 0, 361, 0))
df <- df %>%
  mutate(Preditog2_2018_01 = ifelse(Date_Of_Debt_Sale <= "2019-01-31" & !is.na(Date_Of_Debt_Sale), 361, 0))
df <- df %>%
  mutate(Preditog3_2018_01 = ifelse(Date_Of_Debt_Cancellation <= "2019-01-31" & !is.na(Date_Of_Debt_Cancellation), 361, 0))
df <- df %>%
  mutate(Preditog_2018_01 = ifelse(Preditog1_2018_01 == 361 | Preditog2_2018_01 == 361 | Preditog3_2018_01 == 361, 361, 0))
df <- df %>%
  mutate(Itog_2018_01 = ifelse(Preditog_2018_01 == 361, 361, Debt_Duration_As_Of_31.01.2019))
#Calculations (where the process is extremely slow, especially calculating the sum)
##Extra filtering
df <- df %>%
  mutate(f_0_v_360_2018_01_d = ifelse(Debt_Duration_As_Of_31.01.2018 == 0 & Itog_2018_01 > 360, Main_Debt_As_Of_31.01.2018, 0))
##Calculating sum
df %>%
  summarise(res1 = sum(f_0_v_360_2018_01_d, na.rm = TRUE))
Maybe I'm doing something wrong?
How is it possible to optimize calculation speed?
Thank you in advance for any help!
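One thing worth checking, offered as a sketch rather than a definitive fix: with memory = FALSE the parquet file is re-read and the whole chain of lazy mutates is replayed for every aggregation, and repartition = 1 confines the data to a single partition, so nothing runs in parallel. Materializing the transformed table once with dplyr's compute(), which works on sparklyr tables, may speed up the subsequent sums:
# Sketch: cache the transformed table once so each summarise()
# doesn't re-read the parquet file and replay the lazy pipeline.
# `df` and `spark_conn` are the objects from the question.
library(sparklyr)
library(dplyr)

df_cached <- df %>%
  compute("df_cached")   # materializes and caches the result in Spark

df_cached %>%
  summarise(res1 = sum(f_0_v_360_2018_01_d, na.rm = TRUE))
Reading with memory = TRUE, or a repartition value greater than 1, are variations on the same theme.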

Related

Macro function in R

I'm new to R, coming from SAS. I am trying to repeat this code for a set of datasets called mort_1969, mort_1970, ..., mort_n. How can I modify it so I don't have to rerun it each time with the year variable replaced?
mrt <- read_csv("mort1985.csv")
mrt <- mrt %>%
rename(nchs_code = countyrs)
crosswalk <- read_csv("crosswalk.csv")
crosswalk <- crosswalk %>%
select(-X1)
mrt_cw <- left_join(mrt, crosswalk, by = "nchs_code")
mrt_cw1 <- mrt_cw %>%
filter(cityrs == 999) %>%
select(fips, racer3) %>%
mutate(count = 1) %>%
group_by(fips, racer3) %>%
summarise(mrt_c = sum(count)) %>%
mutate(racer3=replace(racer3, racer3==1, "white_pop"),
racer3=replace(racer3, racer3==2, "black_pop"),
racer3=replace(racer3, racer3==3, "other_race_pop"))
mrt_cw2 <- pivot_wider(
data=mrt_cw1,
names_from = racer3,
values_from = mrt_c
)
#Convert NAs to 0 and drop original variables
mrt_cw3 <- mrt_cw2 %>%
mutate(white_m = ifelse(is.na(white_pop), 0, white_pop),
black_m = ifelse(is.na(black_pop), 0, black_pop),
other_race_m = ifelse(is.na(other_race_pop), 0, other_race_pop)) %>%
select(-white_pop, -black_pop, -other_race_pop) %>%
mutate(total_m = white_m + black_m + other_race_m)
pop <- read_csv('population_file.csv')
##########################ADD YEAR
pop <- pop %>%
filter(year == 1985) %>%
select(-X1, -year)
mrt_cw4 <- left_join(mrt_cw3, pop, by = "fips")
mrt_cw5 <- mrt_cw4 %>%
mutate(ttl_rate = (total_m/total_pop)*100,
blk_rate = (black_m/black_pop)*100,
wht_rate = (white_m/white_pop)*100,
otr_rate = (other_race_m/other_races_pop)*100)
mrt_cw6 <- mrt_cw5 %>%
mutate(high_flag = ifelse(ttl_rate >= 100 | blk_rate >= 100 |
wht_rate >= 100 | otr_rate >= 100, 1, 0))
##########################ADD YEAR
#Save file
write.csv(mrt_cw6, "mort_85.csv")

Group data by year and filter by month in R

I have a list of data frames with daily streamflow data.
I want to estimate the maximum daily flow from June to November of every year, for each data frame in the list (each one corresponds to the data of one station).
This is how the list of data frames looks (screenshot omitted), and this is the code I am using:
#Peak mean daily flow summer and fall (June to November)
PeakflowSummerFall <- lapply(listDF, function(x){
  x %>%
    group_by(x %>% mutate(year = year(Date))) %>%
    filter((x %>% mutate(month = month(Date)) >= 6) & (x %>% mutate(month = month(Date)) <= 11)) %>%
    summarise(max = max(DailyStreamflow, na.rm = TRUE))
})
but I am getting this error:
<error/dplyr_error>
Problem with `filter()` input `..1`.
x Input `..1` must be of size 1, not size 24601.
i Input `..1` is `&...`.
i The error occurred in group 1: Date = 1953-06-01, DailyStreamflow = 32, year = 1953.
Backtrace:
Run `rlang::last_trace()` to see the full context
Any solution to this problem?
#### This should provide you with enough
#### sample data for answerers to work with
install.packages('purrr')
library(purrr)
sample_dat <- listDF %>%
  head() %>%
  map(~ head(.x))
dput(sample_dat)
#### With that being said...
#### You should flatten the data frame...
#### It's easier to work with...
install.packages('lubridate')
library(lubridate)
library(dplyr)   # needed for mutate/filter/group_by below
listDF %>%
  plyr::ldply(rbind) %>%
  mutate(month = floor_date(Date, unit = 'month')) %>%
  filter(month(Date) > 5, month(Date) < 12) %>%
  group_by(.id, month) %>%
  dplyr::summarise(max_flow = max(DailyStreamflow)) %>%
  split(.$.id)
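Note that grouping by floor_date(..., unit = 'month') gives one maximum per month, while the question asks for one per year. A variant of the same flattening idea grouped by year, sketched with bind_rows so an unnamed list still gets ids:
library(dplyr)
library(lubridate)
listDF %>%
  bind_rows(.id = "station") %>%                  # numbers the frames if the list is unnamed
  filter(month(Date) >= 6, month(Date) <= 11) %>%
  group_by(station, year = year(Date)) %>%
  summarise(max_flow = max(DailyStreamflow, na.rm = TRUE), .groups = "drop")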
Given the posted image of the data structure, the following might work.
library(lubridate)
library(dplyr)
listDF %>%
  purrr::map(function(x){
    x %>%
      filter(month(Date) >= 6 & month(Date) <= 11) %>%
      group_by(year(Date)) %>%
      summarise(Max = max(DailyStreamflow, na.rm = TRUE), .groups = "keep")
  })
Test data creation code.
fun <- function(year, n){
  d1 <- as.Date(paste(year, 1, 1, sep = "-"))
  d2 <- as.Date(paste(year + 10, 12, 31, sep = "-"))
  d <- seq(d1, d2, by = "day")
  d <- sort(rep(sample(d, n, TRUE), length.out = n))
  flow <- sample(10*n, n, TRUE)
  data.frame(Date = d, DailyStreamflow = flow)
}
set.seed(2020)
listDF <- lapply(1:3, function(i) fun(c(1953, 1965, 1980)[i], c(24601, 13270, 17761)[i]))
str(listDF)
rm(fun)
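With that test data, the second answer runs end to end. As a small usage variation (not from the original answers), naming the grouping expression gives a cleaner column than the literal `year(Date)` header:
library(dplyr)
library(lubridate)
res <- purrr::map(listDF, function(x) {
  x %>%
    filter(month(Date) >= 6 & month(Date) <= 11) %>%
    group_by(Year = year(Date)) %>%   # named grouping column
    summarise(Max = max(DailyStreamflow, na.rm = TRUE), .groups = "drop")
})
str(res, max.level = 1)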

dplyr::summarise: pull a value based on another column's max

Based on the reproducible code below, how can I add an Address column conditionally, based on max(LeastNEmployees)?
library(readr)
library(dplyr)
dat_url <- "https://gender-pay-gap.service.gov.uk/viewing/download-data/2019"
dat <- read_csv(dat_url)
#2 convert EmployerSize
df = data.frame(EmployerSize = c('Less than 250', '250 to 499', '500 to 999',
                                 '1000 to 4999', '5000 to 19,999', '20,000 or more'),
                LeastNEmployees = c(1, 250, 500, 1000, 5000, 20000))
a <- dat %>%
  left_join(df, c('EmployerSize' = 'EmployerSize')) %>%
  group_by(ResponsiblePerson) %>%
  summarize(
    across(where(is.numeric) & !starts_with("Least"), mean),
    across(c("EmployerName","SicCodes"), ~toString(.x)),
    LeastNEmployees = max(LeastNEmployees))
Here is one way to do it, with a which() condition.
a <- dat %>%
  left_join(df, c('EmployerSize' = 'EmployerSize')) %>%
  group_by(ResponsiblePerson) %>%
  summarize(
    across(where(is.numeric) & !starts_with("Least"), mean),
    across(c("EmployerName","SicCodes"), ~toString(.x)),
    # take Address before LeastNEmployees is overwritten by its max;
    # [1] guards against ties
    Address = Address[which(LeastNEmployees == max(LeastNEmployees))[1]],
    LeastNEmployees = max(LeastNEmployees))
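The pattern here is indexing one column at the position of another column's maximum. A toy illustration with made-up data (grp, n, and addr are hypothetical names, not from the dataset above):
library(dplyr)
toy <- data.frame(grp  = c("a", "a", "b"),
                  n    = c(1, 250, 500),
                  addr = c("x", "y", "z"))
toy %>%
  group_by(grp) %>%
  summarise(addr_at_max = addr[which.max(n)],   # "y" for group a, "z" for b
            n_max       = max(n))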

How to delete specific records of one dataframe according to values in another dataframe in R

I have two dataframes (dat1 & dat2). Some records in dat2 need to be deleted, depending on whether the corresponding var1 in dat1 is negative. I use the following code, but I don't think it is the best approach, because it relies on an extra temporary dataframe tmp. Is there a better method?
library(dplyr)
Date1 <- c("1999-12-17", "2005-1-5", "2003-11-2", "2005-6-12", "2005-8-9")
Date1 <- as.POSIXct(Date1, tz = "UTC")
Date2 <- c("2005-1-5", "2005-6-12", "2005-8-9")
Date2 <- as.POSIXct(Date2, tz = "UTC")
var1 <- c(-3, -10, 9, 5, 8)
var2 <- c(0.2, 0.6, 0.15)
dat1 <- data.frame(Date1, var1)
dat2 <- data.frame(Date2, var2)
#Below is what I did
tmp <- inner_join(dat1, dat2, by = c("Date1" = "Date2"))
tmp <- tmp[-tmp$var1 < 0, ]
dat2 <- tmp[, c(1,3)]
Something like this should work:
dat2 %>%
  left_join(dat1, by = c("Date2" = "Date1")) %>%
  filter(var1 > 0) %>%
  mutate(var1 = NULL)   # drop the helper column again
Given you're already using dplyr, why not make better use of pipes, filter, and select, as such:
library(dplyr)
dat2 %>%
  left_join(dat1, by = c("Date2" = "Date1")) %>%
  filter(var1 >= 0) %>%
  select(-var1)
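Another option, sketched with the sample data above: an anti_join against just the negative rows of dat1 removes the matching records without ever adding var1 to dat2:
library(dplyr)
# keep only dat2 rows whose date does NOT match a negative-var1 row in dat1
dat2 %>%
  anti_join(filter(dat1, var1 < 0), by = c("Date2" = "Date1"))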

How to get formulas of multiple regressions by vectorizing

Suppose I have the following code, which runs multiple regressions and stores the lm models and the stepwise-selected (stepAIC) models in tibbles:
library(dplyr)
library(tibble)
library(MASS)
set.seed(1)
df <- data.frame(A = sample(3, 10, replace = T),
                 B = sample(100, 10, replace = T),
                 C = sample(100, 10, replace = T))
df <- df %>% arrange(A)
formula_df <- as.tibble(NA)
aic_df <- as.tibble(NA)
for (i in unique(df$A)){
  temp <- df %>% filter(A == i)
  formula_df[i, 1] <- temp %>%
    do(model = lm(B ~ C, data = .))
  aic_df[i, 1] <- temp %>%
    do(model = stepAIC(formula_df[[1,1]], direction = "both", trace = F))
}
Is it possible to vectorize this to make it faster, for example using the *apply functions? The loop becomes extremely slow as the data gets larger. Thank you in advance.
You could try something like:
model <- df %>%
  group_by(A) %>%
  summarise(formula_model = list(lm(B ~ C, data = cur_data()))) %>%   # one lm per group
  mutate(aic_model = lapply(formula_model, stepAIC,
                            direction = "both", trace = FALSE))       # stepAIC on every model, not just the first
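Since the question mentions the *apply family, the same result can also be sketched with split() plus lapply(), which avoids the tibble bookkeeping entirely (df is the data frame defined in the question):
library(MASS)
# one lm per level of A, then stepAIC on each fitted model
models     <- lapply(split(df, df$A), function(d) lm(B ~ C, data = d))
aic_models <- lapply(models, stepAIC, direction = "both", trace = FALSE)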
