Append to dataframe through for loop and function - r

When I run the code with a write-to-CSV step at the end, everything works correctly: I filter my dataset by country, do some calculations, and save the result as a per-country file.
But when I try to rbind my datasets instead, I get nothing back. I have tried different solutions that I found, but nothing works. Clearly I do not understand why I cannot rbind.
The code:
library(dplyr)
library(readxl)
setwd("Z:/Reporting_Private/Tableau")
dataupl <- read_excel("Analysis Map_Tableau - Ready.xlsm", sheet = "Data")
df = dataupl
# select the right columns
df = df[, 1:6]
colnames(df) = c("Office", "Employee", "Territory", "Sales", "Leads", "Act")
# change NA to zero
df[is.na(df)] = 0
countries = df %>% select(Office) %>% distinct()
countries = as.data.frame(countries)
engine <- function(input) {
  df = df %>% filter(Office == input)
  SCALESALES = scale(df$Sales)
  SCALELEADS = scale(df$Leads)
  SCALEACT = scale(df$Act)
  df = df %>% mutate(SCALESALES = SCALESALES, SCALELEADS = SCALELEADS, SCALEACT = SCALEACT)
  df$SLegend = ave(df$Sales, df$SalesLegend, FUN = min)
  df$SLegend = ifelse(df$SLegend > 0, df$SLegend, 0)
  df$LLegend = ave(df$Leads, df$LeadsLegend, FUN = min)
  df$ALegend = ave(df$Act, df$ActLegend, FUN = min)
  # write.csv(df, file = paste0(input, ".csv"))
  dftotal = data.frame()
  dftotal = rbind(dftotal, df)
}
for (i in 1:nrow(countries)) {
  input = countries[i, ]
  engine(input)
}

You get nothing back because dftotal is created as an empty data frame inside engine() on every call, and the value that engine(input) returns is never assigned to anything outside the function. Instead, you can split your dataframe with L <- split(df, df$Office) and then lapply(L, ...) over the pieces. Instead of write.csv(...), have the function return the dataframe with return(df).
So, something like this:
engine <- function(dfi) {
  SCALESALES = scale(dfi$Sales)
  SCALELEADS = scale(dfi$Leads)
  SCALEACT = scale(dfi$Act)
  dfi = dfi %>% mutate(SCALESALES = SCALESALES, SCALELEADS = SCALELEADS, SCALEACT = SCALEACT)
  dfi$SLegend = ave(dfi$Sales, dfi$SalesLegend, FUN = min)
  dfi$SLegend = ifelse(dfi$SLegend > 0, dfi$SLegend, 0)
  dfi$LLegend = ave(dfi$Leads, dfi$LeadsLegend, FUN = min)
  dfi$ALegend = ave(dfi$Act, dfi$ActLegend, FUN = min)
  return(dfi)
}
L <- split(df, df$Office)
Lnew <- lapply(L, engine)
dftotal <- Lnew[[1]]
for (i in 2:length(Lnew)) dftotal <- rbind(dftotal, Lnew[[i]])
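The closing loop can also be collapsed into a single call, since do.call() hands the whole list to rbind() at once (dplyr::bind_rows(Lnew) is equivalent):
# bind all per-office pieces in one step
dftotal <- do.call(rbind, Lnew)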

Related

Vectorize Functions in R

I have this dataset in R that looks something like this:
id = sample.int(10000, 100000, replace = TRUE)
res = c(1,0)
results = sample(res, 100000, replace = TRUE)
date_exam_taken = sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)
my_data = data.frame(id, results, date_exam_taken)
my_data <- my_data[order(my_data$id, my_data$date_exam_taken),]
my_data$general_id = 1:nrow(my_data)
my_data$exam_number = ave(my_data$general_id, my_data$id, FUN = seq_along)
my_data$general_id = NULL
Using R, I wrote this loop that calculates, for each student, the conditional probabilities of the next exam result given the previous exam result:
library(data.table)
setDT(my_data)
my_list = vector("list", length(unique(my_data$id)))
# Create an empty vector with pre-specified dimensions
my_vector = vector("list", 100)
for (i in 1:length(unique(my_data$id))) {
  tryCatch({
    start_i = my_data[my_data$id == i, ]
    pairs_i = data.frame(first = head(start_i$results, -1), second = tail(start_i$results, -1))
    frame_i = as.data.table(table(pairs_i))
    frame_i[, id := i]
    print(frame_i)
    my_vector[[i]] = frame_i
  }, error = function(e) {})
}
final = rbindlist(my_vector)
I am now trying to "vectorize" this code for improved efficiency. Here is my attempt:
# I don't think I need to create a "list or vector" to store the results in advance?
my_vector = sapply(unique(my_data$id), function(i) {
  tryCatch({
    setDT(my_data)
    start_i = my_data[my_data$id == i, ]
    pairs_i = data.frame(first = head(start_i$results, -1), second = tail(start_i$results, -1))
    frame_i = as.data.frame(table(pairs_i))
    frame_i$i = i
    print(frame_i)
    return(frame_i)
  }, error = function(e) { print(paste("An error occurred:", e)) })
})
# produced an error, so I tried a different code
final = rbindlist(my_vector, fill = TRUE)
# not sure if this fully worked either?
final = do.call(rbind.data.frame, my_vector)
Have I correctly "vectorized" this code?
UPDATE: Probability Calculation Template:
library(dplyr)
total_1 = final %>% group_by(first, second) %>% summarise(totals = n())
total_2 = total_1 %>% group_by(first) %>% summarise(sum = sum(totals))
join = merge(x = total_1, y = total_2, by = c("first"), all = TRUE)
join$probs = join$totals/join$sum
na.omit(join)
You can build all the (first, second) pairs at once with a grouped summarise, no loop needed:
library(magrittr)
library(dplyr)
group_by(my_data, id) %>%
  summarise(first = head(results, -1), second = tail(results, -1)) %>%
  ungroup()
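From there the conditional probabilities can be computed directly from the pair table. Note that in dplyr >= 1.1 a multi-row summarise() is deprecated in favour of reframe(); a sketch putting both steps together:
# all consecutive (first, second) result pairs per student
pairs <- my_data %>%
  group_by(id) %>%
  reframe(first = head(results, -1), second = tail(results, -1))
# P(second | first): pair frequencies divided by the totals per "first"
probs <- pairs %>%
  count(first, second, name = "totals") %>%
  group_by(first) %>%
  mutate(probs = totals / sum(totals)) %>%
  ungroup()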

Optimizing large db merge using split() function

I need to perform a conceptually straightforward double left-join followed by a simple series of matching functions (see: Straightforward Solution). However, since the DBs I have to merge are large, I tried to unpack the merging procedure with a for loop, which does the trick but is inefficient to say the least (see: For-loop Solution).
Is there a solution that splits, and names the pieces of, at least the largest db?
Below there is a toy example.
For reference, in my data:
db_m1: ~50k rows (~5k unique m1)
db_m2: ~25k rows (~5k unique m1 and m2)
db_p: ~100m rows
set.seed(0)
db_m1 <- data.frame(
  y = rep(1, 20),
  id = sort(rep(paste0("id_", c(letters[1:4])), 5)),
  m1 = rep(c(1, 2), 10),
  x1 = sample(LETTERS, 20, TRUE),
  x2 = sample(LETTERS, 20, TRUE))
set.seed(0)
db_m2 <- data.frame(
  y = rep(1, 20),
  m1 = sample(c(1:5), 20, TRUE),
  m2 = sample(c(6:10), 20, TRUE))
set.seed(0)
db_p <- data.frame(
  m2 = sample(c(6:10), 100, TRUE),
  y1 = sample(LETTERS, 100, TRUE),
  y2 = sample(LETTERS, 10, TRUE))
Straightforward Solution:
final_dplyr <- db_m1 %>%
  dplyr::left_join(db_m2) %>%
  dplyr::left_join(db_p) %>%
  dplyr::mutate(match_1 = ifelse(x1 == y1 | x1 == y2, 1, 0),
                match_2 = ifelse(x2 == y1 | x2 == y2, 1, 0),
                sum_matches = mapply(sum, match_1, match_2),
                final_1 = ifelse(as.numeric(sum_matches) >= 1, 1, 0),
                final_2 = ifelse(as.numeric(sum_matches) >= 2, 1, 0)) %>%
  group_by(id, m2) %>%
  dplyr::mutate(n_p = n(),
                n_p = ifelse(all(is.na(y1)), NA, n_p)) %>%
  group_by(y, id, m1, m2, n_p) %>%
  dplyr::summarise(match_1 = sum(match_1, na.rm = TRUE),
                   match_2 = sum(match_2, na.rm = TRUE),
                   final_1 = sum(final_1),
                   final_2 = sum(final_2))
For-loop Solution:
fn_final <- function(db_m1, db_m2, db_p) {
  matches_final <- vector("list", length = length(unique(db_m1$y)))
  for (i in 1:length(unique(db_m1$y))) {
    matches <- vector("list", length = length(unique(db_m1$m1)))
    for (j in 1:length(unique(db_m1$m1))) {
      temp_db_m1 <- db_m1 %>% dplyr::filter(y == unique(db_m1$y)[i], m1 == unique(db_m1$m1)[j])
      temp_db_m2 <- db_m2 %>% dplyr::filter(y == unique(db_m1$y)[i], m1 == unique(db_m1$m1)[j])
      m_vector <- unique(temp_db_m2$m2)
      temp_db_p <- db_p %>%
        dplyr::filter(m2 %in% m_vector)
      final <- temp_db_m1 %>%   # join the filtered pieces, not the full tables
        dplyr::left_join(temp_db_m2) %>%
        dplyr::left_join(temp_db_p) %>%
        dplyr::mutate(match_1 = ifelse(x1 == y1 | x1 == y2, 1, 0),
                      match_2 = ifelse(x2 == y1 | x2 == y2, 1, 0),
                      sum_matches = mapply(sum, match_1, match_2),
                      final_1 = ifelse(as.numeric(sum_matches) >= 1, 1, 0),
                      final_2 = ifelse(as.numeric(sum_matches) >= 2, 1, 0)) %>%
        group_by(id, m2) %>%
        dplyr::mutate(n_p = n(),
                      n_p = ifelse(all(is.na(y1)), NA, n_p)) %>%
        group_by(y, id, m1, m2, n_p) %>%
        dplyr::summarise(match_1 = sum(match_1, na.rm = TRUE),
                         match_2 = sum(match_2, na.rm = TRUE),
                         final_1 = sum(final_1),
                         final_2 = sum(final_2))
      matches[[j]] <- final
    }
    matches_all <- do.call(rbind, matches)
    matches_final[[i]] <- matches_all
  }
  final <- do.call(rbind, matches_final) %>%
    dplyr::filter(!is.na(n_p)) %>%
    unique()
  return(final)
}
final_for <- fn_final(db_m1, db_m2, db_p)
This is a possible solution; should it be optimized further?
db_m1_s <- split(db_m1, f = list(db_m1$y, db_m1$m1))
db_m2_s <- split(db_m2, f = list(db_m2$y, db_m2$m1))
db_p_s <- split(db_p, f = list(db_p$m2))
match_fn <- function(temp_db_m1, temp_db_m2, temp_db_p) {
  final <- temp_db_m1 %>%
    dplyr::left_join(temp_db_m2) %>%
    dplyr::left_join(temp_db_p) %>%
    dplyr::mutate(match_1 = ifelse(x1 == y1 | x1 == y2, 1, 0),
                  match_2 = ifelse(x2 == y1 | x2 == y2, 1, 0),
                  sum_matches = mapply(sum, match_1, match_2),
                  final_1 = ifelse(as.numeric(sum_matches) >= 1, 1, 0),
                  final_2 = ifelse(as.numeric(sum_matches) >= 2, 1, 0)) %>%
    group_by(id, m2) %>%
    dplyr::mutate(n_p = n(),
                  n_p = ifelse(all(is.na(y1)), NA, n_p)) %>%
    group_by(y, id, m1, m2, n_p) %>%
    dplyr::summarise(match_1 = sum(match_1, na.rm = TRUE),
                     match_2 = sum(match_2, na.rm = TRUE),
                     final_1 = sum(final_1),
                     final_2 = sum(final_2))
  return(final)
}
fn_final <- function(db_m1, db_m1_s, db_m2_s, db_p_s) {
  m <- names(db_m1_s)
  matches_1 <- vector("list", length = length(m))
  for (i in 1:length(m)) {
    temp_db_m1 <- db_m1_s[[m[i]]]
    temp_db_m2 <- db_m2_s[[m[i]]]
    n <- as.character(sort(unique(temp_db_m2$m2)))
    matches_2 <- vector("list", length = length(n))
    for (j in 1:length(n)) {
      temp_db_p <- db_p_s[[n[j]]]
      final <- match_fn(temp_db_m1, temp_db_m2, temp_db_p)
      matches_2[[j]] <- final
    }
    matches_all <- do.call(rbind, matches_2)
    matches_1[[i]] <- matches_all
  }
  matches_0 <- do.call(rbind, matches_1) %>%
    dplyr::filter(!is.na(n_p)) %>%
    unique()
  return(matches_0)
}
final_for <- fn_final(db_m1, db_m1_s, db_m2_s, db_p_s)
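With a db_p of ~100m rows it may also be worth benchmarking data.table's keyed joins before micro-optimizing the split() approach; a minimal sketch, assuming the toy columns above (the grouping and summarising can stay in dplyr afterwards):
library(data.table)
dt_m1 <- as.data.table(db_m1)
dt_m2 <- as.data.table(db_m2)
dt_p <- as.data.table(db_p)
# chained joins: db_m1 -> db_m2 on (y, m1), then -> db_p on m2
merged <- dt_p[dt_m2[dt_m1, on = .(y, m1), allow.cartesian = TRUE],
               on = .(m2), allow.cartesian = TRUE]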

Apply a function to multiple datasets using lapply

I have a large number of datasets for which I want to create the same variable. I would like to write a function so that I do not have to repeat the same code many times.
I tried the code below; the first three lines show how the variable is created for a single dataset, and the function below is my attempt to generalize it.
data1 <- data1 %>%
dplyr::group_by(id)%>%
dplyr::mutate(new_var = sum(score))
list_data <- c(data1, data2, data3)
my_func <- function(x) {
  x <- x %>%
    dplyr::group_by(id) %>%
    dplyr::mutate(new_var = sum(score))
}
lapply(list_data, my_func)
I obtain the error message "no applicable method for 'group_by' applied to an object of class 'character'".
Could you please help me figure this out?
This works fine for me. The problem is the line list_data <- c(data1, data2, data3): c() flattens the data frames into one long list of their columns, so lapply() hands my_func individual vectors rather than data frames, which is why group_by() complains about an object of class "character". Build the list with list() instead:
my_func <- function(x) {
  x <- x %>%
    dplyr::group_by(id) %>%
    dplyr::mutate(new_var = sum(score))
}
data1 <- data.frame(id = rep(1:3, each = 3), score = 1:9)
data2 <- data.frame(id = rep(1:3, each = 3), score = 11:19)
data3 <- data.frame(id = rep(1:3, each = 3), score = 21:29)
list_data <- list(data1, data2, data3)
lapply(list_data, my_func)
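Remember that lapply() returns modified copies and leaves the originals untouched, so assign the result if you want to keep it:
list_data <- lapply(list_data, my_func)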

Create line density per polygon using tidyverse + sf R

I have a GIS question that has been stumping me for some time now. The end goal is to extract the density of lines per pixel/voxel/polygon using the tidyverse and sf packages. As of now I have a function that works when I execute it line by line, but not as a function. The ultimate goal is to use this function in sfLapply() from the snowfall package so it runs in parallel. Any help getting this to work as a function would be greatly appreciated! The data involved can be found here:
https://www.dropbox.com/s/zg9o2b5x4wizafo/hexagons.gpkg?dl=0
https://www.dropbox.com/s/x2gxx36pjkutxzm/railroad_lines.gpkg?dl=0
The function I created, which again works line by line but not as a function, is the following:
length_in_poly <- function(fishnet, spatial_lines) {
  require(sf)
  require(tidyverse)
  require(magrittr)
  fishnet <- st_as_sf(do.call(rbind, fishnet))
  spatial_lines <- st_as_sf(do.call(rbind, spatial_lines))
  fish_length <- list()
  for (i in 1:nrow(fishnet)) {
    split_lines <- spatial_lines %>%
      st_cast(., "MULTILINESTRING", group_or_split = FALSE) %>%
      st_intersection(., fishnet[i, ]) %>%
      mutate(lineid = row_number())
    fish_length[[i]] <- split_lines %>%
      mutate(length = sum(st_length(.)))
  }
  fish_length <- do.call(rbind, fish_length) %>%
    group_by(hexid4k) %>%
    summarize(length = sum(length))
  fishnet <- fishnet %>%
    st_join(., fish_length, join = st_intersects) %>%
    mutate(hexid4k = hexid4k.x,
           length = ifelse(is.na(length), 0, length),
           pixel_area = as.numeric(st_area(geom)),
           density = length / pixel_area)
}
To prep the data:
library(sf)
library(tidyverse)
library(snowfall)
input_hexagons <- st_read("hexagons.gpkg")
input_rail_lines <- st_read("railroad_lines.gpkg")
Using some code from here:
faster_as_tibble <- function(x) {
  structure(x, class = c("tbl_df", "tbl", "data.frame", "sfc"),
            row.names = as.character(seq_along(x[[1]])))
}
split_fast_tibble <- function(x, f, drop = FALSE, ...) {
  lapply(split(x = seq_len(nrow(x)), f = f, ...),
         function(ind) faster_as_tibble(lapply(x, "[", ind)))
}
Create a state-wise list:
sub_hexnet <- split_fast_tibble(input_hexagons, input_hexagons$STUSPS) %>%
  lapply(st_as_sf)
Finally, to run just as a single-core process:
test <- lapply(fishnet = as.list(sub_hexnet),
FUN = length_in_poly,
spatial_lines = input_rail_lines)
Or, in the perfect world, a multi-core process:
sfInit(parallel = TRUE, cpus = parallel::detectCores())
sfExport(list = c("sub_hexnet", "mask_rails"))
extractions <- sfLapply(fishnet = sub_hexnet,
fun = length_in_poly,
spatial_lines = input_rail_lines)
sfStop()
Thanks in advance for any help - I am completely stumped!
After messing around with this for a while I finally figured out a solution.
The key helper functions used:
load_data <- function(url, dir, layer, outname) {
  file <- paste0(dir, "/", layer, ".shp")
  if (!file.exists(file)) {
    download.file(url, destfile = paste0(dir, ".zip"))
    unzip(paste0(dir, ".zip"), exdir = dir)
    unlink(paste0(dir, ".zip"))
  }
  name <- paste0(outname, "_shp")
  name <- sf::st_read(dsn = dir, layer = layer)
  name
}
get_density <- function(x, grids, lines) {
  require(tidyverse)
  require(lubridate)
  require(sf)
  sub_grids <- grids %>%
    dplyr::filter(hexid4k == x)
  single_lines_hexid <- lines %>%
    dplyr::filter(hexid4k == x) %>%
    sf::st_intersection(., sub_grids) %>%
    dplyr::select(hexid4k, STUSPS) %>%
    dplyr::mutate(length_line = st_length(.),
                  length_line = ifelse(is.na(length_line), 0, length_line))
  sub_grids <- sub_grids %>%
    sf::st_join(., single_lines_hexid, join = st_intersects) %>%
    dplyr::mutate(hexid4k = hexid4k.x) %>%
    dplyr::group_by(hexid4k) %>%
    dplyr::summarize(length_line = sum(length_line)) %>%
    dplyr::mutate(pixel_area = as.numeric(st_area(geom)),
                  density = length_line / pixel_area) %>%
    dplyr::select(hexid4k, length_line, density, pixel_area)
  return(sub_grids)
}
Prep the input data (p4string_ea is an equal-area projection string defined elsewhere; spsample() and HexPoints2SpatialPolygons() come from the sp package):
library(sp)
usa_shp <- load_data(url = "https://www2.census.gov/geo/tiger/GENZ2016/shp/cb_2016_us_state_20m.zip",
                     dir = 'data',
                     layer = "cb_2016_us_state_20m",
                     outname = "usa") %>%
  sf::st_transform(p4string_ea) %>%
  dplyr::filter(!STUSPS %in% c("HI", "AK", "PR"))
usa_shp$STUSPS <- droplevels(usa_shp$STUSPS)
hex_points <- spsample(as(usa_shp, 'Spatial'), type = "hexagonal", cellsize = 4000)
hex_grid <- HexPoints2SpatialPolygons(hex_points, dx = 4000)
hexnet_4k <- st_as_sf(hex_grid) %>%
  mutate(hexid4k = row_number()) %>%
  st_intersection(., st_union(usa_shp)) %>%
  st_join(., usa_shp, join = st_intersects) %>%
  dplyr::select(hexid4k, STUSPS)
transmission_lines_hex <- load_data(url = "https://hifld-dhs-gii.opendata.arcgis.com/datasets/75af06441c994aaf8e36208b7cd44014_0.zip",
                                    dir = 'data',
                                    layer = 'Electric_Power_Transmission_Lines',
                                    outname = 'tl') %>%
  dplyr::select(LINEARID, STUSPS) %>%
  st_join(., hexnet_4k, join = st_intersects) %>%
  mutate(STUSPS = STUSPS.x) %>%
  dplyr::select(LINEARID, hexid4k, STUSPS)
The resulting parallel process is as follows:
hexnet_list <- hexnet_4k %>%
  split(., .$STUSPS)
sfInit(parallel = TRUE, cpus = num_cores)
sfExport('transmission_lines_hex')
sfSource('src/functions/helper_functions.R')
transmission_lines_density <- sfLapply(hexnet_list,
  function(input_list) {
    require(tidyverse)
    require(magrittr)
    require(lubridate)
    require(sf)
    sub_grid <- dplyr::bind_cols(input_list)
    unique_ids <- unique(sub_grid$hexid4k)
    state_name <- unique(sub_grid$STUSPS)[1]
    print(paste0('Working on ', state_name))
    got_density <- lapply(unique_ids,
                          FUN = get_density,
                          grids = sub_grid,
                          lines = transmission_lines_hex)
    print(paste0('Finishing ', state_name))
    return(got_density)
  })
sfStop()
I hope some of this may be useful to you and, as always, suggestions on optimization would be welcomed.
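For completeness, on current sf/dplyr versions the per-polygon line length can also be computed in one pass, without an explicit per-hexagon loop. This is a sketch using the question's layers; it assumes both share a projected CRS and that the hexagons carry a hexid4k id column:
library(sf)
library(dplyr)
# clip lines to hexagons, measure each clipped piece, then sum per hexagon
pieces <- st_intersection(input_rail_lines, input_hexagons)
pieces$len <- as.numeric(st_length(pieces))
lengths_by_hex <- pieces %>%
  st_drop_geometry() %>%
  group_by(hexid4k) %>%
  summarise(length = sum(len))
densities <- input_hexagons %>%
  left_join(lengths_by_hex, by = "hexid4k")
densities$length[is.na(densities$length)] <- 0   # hexagons with no lines
densities$density <- densities$length / as.numeric(st_area(densities))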

How to run a changepoint analysis on multiple time series with dplyr

I'm collecting time series data from Wikipedia and want to run a changepoint analysis on each time series using dplyr. But when I do so I get an error saying the data must be numeric, even though class() reports that it is numeric. I hope you can help.
library(changepoint)
library(dplyr)
library(pageviews)
library(data.table)
articles <- c("Rugby_union", "Football")
foo <- function(x) {
  article_pageviews(project = "en.wikipedia",
                    article = x,
                    start = as.Date('2017-01-01'),
                    end = as.Date("2017-12-31"),
                    user_type = "user", platform = c("mobile-web"))
}
output <- articles %>% foo
output %>%
  select(article, views) %>%
  do(cpt.mean(.))
class(output$views)
The error comes from do(cpt.mean(.)): do() passes the whole data frame, including the character article column, to cpt.mean(), which expects a numeric vector or matrix. Extracting the views per article works:
library(changepoint)
library(dplyr)
library(pageviews)
articles <- c("Rugby_union", "Football")
foo <- function(x) {
  article_pageviews(project = "en.wikipedia", article = x,
                    start = as.Date('2017-01-01'),
                    end = as.Date("2017-12-31"),
                    user_type = "user", platform = c("mobile-web"))
}
output <- articles %>% foo
df <- as.data.frame(table(output$article))
output1 <- output %>%
  dplyr::select(article, views) %>%
  dplyr::filter(article == df[1, 1])
output2 <- output %>%
  dplyr::select(article, views) %>%
  dplyr::filter(article == df[2, 1])
q <- floor((min(length(output1$views), length(output2$views))) / 2 + 1)
cp1 <- changepoint::cpt.mean(data = output1$views, Q = q, method = "BinSeg", penalty = "SIC")
plot(cp1)
cp2 <- changepoint::cpt.mean(data = output2$views, Q = q, method = "BinSeg", penalty = "SIC")
plot(cp2)
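To scale this to many articles without copy-pasting, one dplyr pattern (a sketch; cpt.mean() returns an S4 object, so it has to be wrapped in a list-column) is:
library(changepoint)
library(dplyr)
cpts <- output %>%
  group_by(article) %>%
  summarise(cpt = list(cpt.mean(views, method = "BinSeg", penalty = "SIC")))
plot(cpts$cpt[[1]])   # changepoint model for the first article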
