Not summarizing data correctly in R [duplicate] - r

I am using the following code to summarize my data by a column
library(data.table, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
################
## PARAMETERS ##
################
# Set path of major source folder for raw transaction data
in_directory <- "C:/Users/NAME/Documents/Raw Data/"
# List names of sub-folders (currently grouped by first two characters of CUST_ID)
in_subfolders <- list("AA-CA", "CB-HZ")
# Set location for output
out_directory <- "C:/Users/NAME/Documents/YTD Master/"
out_filename <- "OUTPUT.csv"
# Set beginning and end of date range to be collected - year-month-day format
date_range <- interval(as.Date("2017-01-01"), as.Date("2017-01-31"))
# Enable or disable filtering of raw files to only grab items bought within certain months to save space.
# If false, all files will be scanned for unique items, which will take longer and be a larger file.
date_filter <- TRUE
##########
## CODE ##
##########
starttime <- Sys.time()
mastertable <- NULL
for (j in 1:length(in_subfolders)) {
subfolder <- in_subfolders[j]
sub_directory <- paste0(in_directory, subfolder, "/")
## IMPORT DATA
in_filenames <- dir(sub_directory, pattern =".txt")
for (i in 1:length(in_filenames)) {
# Default value provided for when fast filtering is disabled.
read_this_file <- TRUE
# To fast filter the data, we choose to include or exclude an entire file based on the date of its first line.
# WARNING: This is only a valid method if filtering by entire months, since that is the amount of data housed in each file.
if (date_filter) {
temptable <- fread(paste0(sub_directory, in_filenames[i]), colClasses=c(CUSTOMER_TIER = "character"),
na.strings = "", nrows = 1)
temptable[, INVOICE_DT := as.Date(INVOICE_DT)]
# If date matches, set read flag to TRUE. If date does not match, set read flag to FALSE.
read_this_file <- temptable[, INVOICE_DT] %within% date_range
}
if (read_this_file) {
print(Sys.time()-starttime)
print(paste0("Reading in ", in_filenames[i]))
temptable <- fread(paste0(sub_directory, in_filenames[i]), colClasses = c(CUSTOMER_TIER = "character"),
na.strings = "")
temptable <- temptable[,lapply(.SD, sum), by = .(CUST_ID),
.SDcols = c("Ext Sale")]
# Combine into full list
mastertable <- rbindlist(list(mastertable, temptable), use.names = TRUE)
# Release unneeded memory
rm(temptable)
}
}
}
# Save Final table
print("Saving master table")
fwrite(mastertable, paste0(out_directory, out_filename))
rm(mastertable)
print(Sys.time()-starttime)
The output i receive after running the above script for the month of January is as below and this is the output I expect.
CUST_ID Ext Sale
AK0010001 209.97
CO0020001 1540.3
The problem arises when i use multiple months. Below is the output I receive when I run Jan-Feb date_range <- interval(as.Date("2017-01-01"), as.Date("2017-02-28"))
CUST_ID Ext Sale
AK0010001 209.97
AK0010001 217.833
CO0020001 1540.3
CO0010001 -179.765
As you can see in the output above the CUST_ID is no longer consolidating.
Does anyone know why this would be happening?
Below I have provided some data to reproduce what I am working with. Just save the files into 4 separate text file and into folders like I have it in my code.
I have 2 separate folders saved as "AA-CA" and "CB-HZ"
File 1 saved as "AA-CA 2017-01.txt"
INVOICE_DT,BRANCH_CODE,INVOICE_NO,INV_SEQ_NO,INV_ITEM_ID,ITEM_DESCR,STD_ITEM,PRIVATE_LABEL,CATEGORY_PATH1,CATEGORY_PATH2,CUST_ID,CUSTOMER_TIER,IS_VENDING,SALE_PRICE,TOTAL_COST,POS_COST,CE100,CE110,CE120,CE200,CORP_PRICE,QTY_SOLD,PACKSLIP_WHSL,PRICING_GROUP,PGG_MIN_PRICE,PGY_MIN_PRICE,PGR_MIN_PRICE,Ext Sale,Ext Total Cost
2017-01-27,AK001,AK0016997,4,12772-00079,"3.75"""""""" 4.12"""""""" HOSE OD",N,N,08.5-Fleet & Automotive,01.6-DOT Hose & Tubing,AK0010001,Tier 3,No,42.74,22.438335,22.438335,21.37,,,0,,3,,PGR,168.2875125,134.63001,112.191675,128.22,67.315005
2017-01-27,AK001,AK0016997,3,12772-00022,"2.5"""""""" 2.87"""""""" HOSE OD C",N,N,08-Hydraulics & Pneumatics,02-Hose and Hose Reels,AK0010001,Tier 3,No,27.25,14.143396,14.143396,13.47,,,0,,3,,PGR,106.07547,84.860376,70.71698,81.75,42.430188
File 2 saved as "AA-CA 2017-02.txt"
INVOICE_DT,BRANCH_CODE,INVOICE_NO,INV_SEQ_NO,INV_ITEM_ID,ITEM_DESCR,STD_ITEM,PRIVATE_LABEL,CATEGORY_PATH1,CATEGORY_PATH2,CUST_ID,CUSTOMER_TIER,IS_VENDING,SALE_PRICE,TOTAL_COST,POS_COST,CE100,CE110,CE120,CE200,CORP_PRICE,QTY_SOLD,PACKSLIP_WHSL,PRICING_GROUP,PGG_MIN_PRICE,PGY_MIN_PRICE,PGR_MIN_PRICE,Ext Sale,Ext Total Cost
2017-02-28,AK001,AK0017107,1,12772-00307,3-WAY MALE HOUSING,N,N,09-Electrical,05.5-Terminals and Wire Connectors,AK0010001,Tier 3,No,95.21,74.591453,74.591453,71.04,,,0,,1,,PGG,0,0,0,95.21,74.591453
2017-02-28,AK001,AK0017105,3,99523968,PC58570 1/2 PRS BALL,Y,N,,,AK0010001,Tier 3,No,24.5246,12.356039,12.356039,11.767743,,,0,,5,,PGG,0,0,0,122.623,61.780195
File 3 saved as "CB-HZ 2017-01.txt"
INVOICE_DT,BRANCH_CODE,INVOICE_NO,INV_SEQ_NO,INV_ITEM_ID,ITEM_DESCR,STD_ITEM,PRIVATE_LABEL,CATEGORY_PATH1,CATEGORY_PATH2,CUST_ID,CUSTOMER_TIER,IS_VENDING,SALE_PRICE,TOTAL_COST,POS_COST,CE100,CE110,CE120,CE200,CORP_PRICE,QTY_SOLD,PACKSLIP_WHSL,PRICING_GROUP,PGG_MIN_PRICE,PGY_MIN_PRICE,PGR_MIN_PRICE,Ext Sale,Ext Total Cost
2017-01-31,CO002,CO0023603,19,13117-00095,8-32X5/16 BHSCS MAG,N,N,18-Work Order Parts,Finished Products,CO0020001,Tier 3,No,0.1858,0.037528,0.037528,0.01833,,,0,,6000,,PGG,0,0,0,1114.8,225.168
2017-01-31,CO002,CO0023603,20,13117-00186,"#8-16X3/4"""""""" 6-LOBE PA",N,N,01-Fasteners,03-Screws,CO0020001,Tier 3,No,0.0851,0.029652,0.029652,,,,0,,5000,,PGG,0,0,0,425.5,148.26
File 4 saved as "CB-HZ 2017-02.txt"
INVOICE_DT,BRANCH_CODE,INVOICE_NO,INV_SEQ_NO,INV_ITEM_ID,ITEM_DESCR,STD_ITEM,PRIVATE_LABEL,CATEGORY_PATH1,CATEGORY_PATH2,CUST_ID,CUSTOMER_TIER,IS_VENDING,SALE_PRICE,TOTAL_COST,POS_COST,CE100,CE110,CE120,CE200,CORP_PRICE,QTY_SOLD,PACKSLIP_WHSL,PRICING_GROUP,PGG_MIN_PRICE,PGY_MIN_PRICE,PGR_MIN_PRICE,Ext Sale,Ext Total Cost
2017-02-03,CO001,CO0019017,1,MN2550000A20000,M6-1.0 HEX NUT A-2,Y,N,01-Fasteners,04-Nuts,CO0010001,NA,No,0.0313,0.00767,0.00767,0.006215,0.000593,,0.001241,,-50,0.1058,,,,,-1.565,-0.3835
2017-02-16,CO001,CO0019018,1,11516769,RS37518BlkRndSpacer,Y,N,01.5-Hardware,Electronic Hardware,CO0010001,NA,No,0.0396,0.011245,0.011245,0.01071,,,0,,-4500,0.0543,,,,,-178.2,-50.6025
I have the data saved in 2 separate folders.

The OP is wondering why the result is not consolidated for CUST_ID if more than one month of data is processed.
The reason is that the monthly files are read in and aggregated one by one but a final aggregation step is needed to consolidate over all months.
The code below is a simplified replacement of the double for loops. I have left out the code for testing for "fast filtering".
The first part creates a list of files to be processed. The second part does the processing.
# create vector of filenames to be processed
in_filenames <- list.files(
file.path(in_directory, in_subfolders),
pattern = "\\.txt$",
full.names = TRUE,
recursive = TRUE)
# read and aggregate each file separately
mastertable <- rbindlist(
lapply(in_filenames, function(fn) {
# code for "fast filter" test goes here
message("Reading in ", fn)
temptable <- fread(fn,
colClasses = c(CUSTOMER_TIER = "character"),
na.strings = "")
# aggregate
temptable[, lapply(.SD, sum), by = .(CUST_ID), .SDcols = c("Ext Sale")]
})
)[
# THIS IS THE MISSING STEP:
# second aggregation for overall totals
, lapply(.SD, sum), by = .(CUST_ID), .SDcols = c("Ext Sale")]
Processing file: Raw Data/AA-CA/AA-CA 2017-01.txt
Processing file: Raw Data/AA-CA/AA-CA 2017-02.txt
Processing file: Raw Data/CB-HZ/CB-HZ 2017-01.txt
Processing file: Raw Data/CB-HZ/CB-HZ 2017-02.txt
mastertable
CUST_ID Ext Sale
1: AK0010001 427.803
2: CO0020001 1540.300
3: CO0010001 -179.765
Note that chaining of data.table expressions is used here.
Edit 1:
By request of the OP, here is the complete code (except for the "fast filtering" stuff). There are some additional lines which where modified. They are marked with ### MODIFIED.
library(data.table, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
################
## PARAMETERS ##
################
# Set path of major source folder for raw transaction data
in_directory <- "Raw Data" ### MODIFIED
# List names of sub-folders (currently grouped by first two characters of CUST_ID)
in_subfolders <- list("AA-CA", "CB-HZ")
# Set location for output
out_directory <- "YTD Master" ### MODIFIED
out_filename <- "OUTPUT.csv"
# Set beginning and end of date range to be collected - year-month-day format
date_range <- interval(as.Date("2017-01-01"), as.Date("2017-02-28")) ### MODIFIED
# Enable or disable filtering of raw files to only grab items bought within certain months to save space.
# If false, all files will be scanned for unique items, which will take longer and be a larger file.
date_filter <- TRUE
##########
## CODE ##
##########
starttime <- Sys.time()
# create vector of filenames to be processed
in_filenames <- list.files(
file.path(in_directory, in_subfolders),
pattern = "\\.txt$",
full.names = TRUE,
recursive = TRUE)
# read and aggregate each file separetely
mastertable <- rbindlist(
lapply(in_filenames, function(fn) {
# code for fast filter test goes here
message("Processing file: ", fn)
temptable <- fread(fn,
colClasses = c(CUSTOMER_TIER = "character"),
na.strings = "")
# aggregate by month
temptable[, lapply(.SD, sum), by = .(CUST_ID), .SDcols = c("Ext Sale")]
})
)[
# second aggregation overall
, lapply(.SD, sum), by = .(CUST_ID), .SDcols = c("Ext Sale")]
# Save Final table
print("Saving master table")
fwrite(mastertable, paste0(out_directory, out_filename))
# rm(mastertable) ### MODIFIED
print(Sys.time()-starttime)
Edit 2
The OP has asked to include the "fast filter" code which I had omitted for brevity.
However, I have a different approach. Instead of reading the first line of each file to check if INVOICE_DT is within the given date_range my approach filters the file names. The file names contain the year-month in ISO 8601 format.
So, a vector of allowed year-month strings is constructed from the given date_range. Only those file names which contain one of the allowed year-month strings are selected for further processing.
However, selecting the proper files is only the first step. As the date-range may start or end right in the middel of a month, we need also to filter the rows of each processed file. This step is missing from OP's code.
library(data.table, warn.conflicts = FALSE)
library(magrittr) ### MODIFIED
# library(lubridate, warn.conflicts = FALSE) ### MODIFIED
################
## PARAMETERS ##
################
# Set path of major source folder for raw transaction data
in_directory <- "Raw Data" ### MODIFIED
# List names of sub-folders (currently grouped by first two characters of CUST_ID)
in_subfolders <- list("AA-CA", "CB-HZ")
# Set location for output
out_directory <- "YTD Master" ### MODIFIED
out_filename <- "OUTPUT.csv"
# Set beginning and end of date range to be collected - year-month-day format
date_range <- c("2017-01-01", "2017-02-14") ### MODIFIED
# Enable or disable filtering of raw files to only grab items bought within certain months to save space.
# If false, all files will be scanned for unique items, which will take longer and be a larger file.
# date_filter <- TRUE ### MODIFIED
##########
## CODE ##
##########
starttime <- Sys.time()
# create vector of filenames to be processed
in_filenames <- list.files(
file.path(in_directory, in_subfolders),
pattern = "\\.txt$",
full.names = TRUE,
recursive = TRUE)
# filter filenames, only
selected_in_filenames <-
seq(as.Date(date_range[1]),
as.Date(date_range[2]), by = "1 month") %>%
format("%Y-%m") %>%
lapply(function(x) stringr::str_subset(in_filenames, x)) %>%
unlist()
# read and aggregate each file separetely
mastertable <- rbindlist(
lapply(selected_in_filenames, function(fn) {
message("Processing file: ", fn)
temptable <- fread(fn,
colClasses = c(CUSTOMER_TIER = "character"),
na.strings = "")
# aggregate file but filtered for date_range
temptable[INVOICE_DT %between% date_range,
lapply(.SD, sum), by = .(CUST_ID, QTR = quarter(INVOICE_DT)),
.SDcols = c("Ext Sale")]
})
)[
# second aggregation overall
, lapply(.SD, sum), by = .(CUST_ID, QTR), .SDcols = c("Ext Sale")]
# Save Final table
print("Saving master table")
fwrite(mastertable, file.path(out_directory, out_filename))
# rm(mastertable) ### MODIFIED
print(Sys.time()-starttime)
mastertable
CUST_ID QTR Ext Sale
1: AK0010001 1 209.970
2: CO0020001 1 1540.300
3: CO0010001 1 -1.565
Note that date_range <- c("2017-01-01", "2017-02-14") now ends mid of February.

Related

Select CSV files and read in pairs

I am comparing two pairs of csv files each at a time. The files I have each end with a number like cars_file2.csv, Lorries_file3.csv, computers_file4.csv, phones_file5.csv. I have like 70 files per folder and the way I am comparing is, I compare cars_file2.csv and Lorries_file3.csv then Lorries_file3.csv and
computers_file4.csv, and the pattern is 2,3,3,4,4,5 like that. Is there a smart way I can handle this instead of manually coming back and change file like the way I am reading here or I can use the last number on each csv to read them smartly. NOTE the files have same suffixes _file:
library(daff)
setwd("path")
# Load csvs to compare into data frames
x_original <- read.csv("cars_file2.csv", strip.white=TRUE, stringsAsFactors = FALSE)
x_changed <- read.csv("Lorries_file3.csv", strip.white=TRUE, stringsAsFactors = FALSE)
render(diff_data(x_original,x_changed ,ignore_whitespace=TRUE,count_like_a_spreadsheet = FALSE))
My intention is to compare each two pairs of csv and recorded, Field additions, deletions and modified
You may want to load all files at once and do your comparison with a full list of files.
This may help:
# your path
path <- "insert your path"
# get folders in this path
dir_data <- as.list(list.dirs(path))
# get all filenames
dir_data <- lapply(dir_data,function(x){
# list of folders
files <- list.files(x)
files <- paste(x,files,sep="/")
# only .csv files
files <- files[substring(files,nchar(files)-3,nchar(files)) %in% ".csv"]
# remove possible errors
files <- files[!is.na(files)]
# save if there are files
if(length(files) >= 1){
return(files)
}
})
# delete NULL-values
dir_data <- compact(dir_data)
# make it a named vector
dir_data <- unique(unlist(dir_data))
names(dir_data) <- sub(pattern = "(.*)\\..*$", replacement = "\\1", basename(dir_data))
names(dir_data) <- as.numeric(substring(names(dir_data),nchar(names(dir_data)),nchar(names(dir_data))))
# remove possible NULL-values
dir_data <- dir_data[!is.na(names(dir_data))]
# make it a list again
dir_data <- as.list(dir_data)
# load data
data_upload <- lapply(dir_data,function(x){
if(file.exists(x)){
data <- read.csv(x,header=T,sep=";")
}else{
data <- "file not found"
}
return(data)
})
# setup for comparison
diffs <- lapply(as.character(sort(as.numeric(names(data_upload)))),function(x){
# check if the second dataset exists
if(as.character(as.numeric(x)+1) %in% names(data_upload)){
# first dataset
print(data_upload[[x]])
# second dataset
print(data_upload[[as.character(as.numeric(x)+1)]])
# do your operations here
comparison <- render(diff_data(data_upload[[x]],
data_upload[[as.character(as.numeric(x)+1)]],
ignore_whitespace=T,count_like_a_spreadsheet = F))
numbers <- c(x, as.numeric(x)+1)
# save both the comparison data and the numbers of the datasets
return(list(comparison,numbers))
}
})
# you can find the differences here
diffs
This script loads all csv-files in a folder and its sub-folders and puts them into a list by their numbers. In case there are no doubles, this will work. If you have doubles, you will have to adjust the part where the vector is named so that you can index the full names of the files afterwards.
A simple for- loop using paste will read-in the pairs:
for (i in 1:70) { # assuming the last pair is cars_file70.csv and Lorries_file71.csv
x_original <- read.csv(paste0("cars_file",i,".csv"), strip.white=TRUE, stringsAsFactors = FALSE)
x_changed <- read.csv(paste0("Lorries_file3",i+1,".csv"), strip.white=TRUE, stringsAsFactors = FALSE)
render(diff_data(x_original,x_changed ,ignore_whitespace=TRUE,count_like_a_spreadsheet = FALSE))
}
For simplicity I used 2 .csv files.
csv_1
1,2,4
csv_2
1,8,10
Load all the .csv files from folder,
files <- dir("Your folder path", pattern = '\\.csv', full.names = TRUE)
tables <- lapply(files, read.csv)
#create empty list to store comparison output
diff <- c()
Loop through all loaded files and compare,
for (pos in 1:length(csv)) {
if (pos != length(csv)) { #ignore last one
#save comparison output
diff[[pos]] <- diff_data(as.data.frame(csv[pos]), as.data.frame(csv[pos + 1]), ignore_whitespace=TRUE,count_like_a_spreadsheet = FALSE)
}
}
Compared output by diff
[[1]]
Daff Comparison: ‘as.data.frame(tables[pos])’ vs. ‘as.data.frame(tables[pos + 1])’
+++ +++ --- ---
## X1 X8 X10 X2 X4

How to call a script in another script in R

I have created a series of commands in R that get a job done using a specific URL. I would like to iterate the series of commands over a list of URLS that reside in a separate text file. How do I call the list into the commands one at a time?
I do not know what the proper terminology for this programming action. I've looked into scripting and batch programming but this is not what I want to do.
# URL that comes from list
URL <- "http://www.urlfromlist.com"
# Load URL
theurl <- getURL(URL,.opts = list(ssl.verifypeer = FALSE) )
# Read the tables
tables <- readHTMLTable(theurl)
# Create a list
tables <- list.clean(tables, fun = is.null, recursive = FALSE)
# Convert the list to a data frame
df <- do.call(rbind.data.frame, tables)
# Save dataframe out as a csv file
write.csv(df2, file = dynamicname, row.names=FALSE)
The above code is what I am doing. The first variable needs to be a different URL each time from a list - rinse and repeat. Thanks!
UPDATED CODE - this is still not writing out any files but runs.
# Function to pull tables from list of URLs
URLfunction<- function(x){
# URL that comes from list
URL <- x
# Load URL
theurl <- RCurl::getURL(URL,.opts = list(ssl.verifypeer = FALSE) )
# Read the tables
tables <- XML::readHTMLTable(theurl)
# Create a list
tables <- rlist::list.clean(tables, fun = is.null, recursive = FALSE)
# Convert the list to a data frame
df <- do.call(rbind,tables)
# Split date and time column out
df2 <- separate(df, "Date / Time", c("Date", "Time"), sep = " ")
# Fill the missing column with text, in this case shapename
shapename <- qdapRegex::ex_between(URL, "ndxs", ".html")
df2$Shape <- shapename
# Save dataframe out as a csv file
write.csv(result, paste0(shapename, '.csv', row.names=FALSE))
return(df2)
}
URL <- read.csv("PATH", header = FALSE)
purrr::map_df(URL, URLfunction) ## Also tried purrr::map_df(URL[,1], URLfunction)
If i understand your question correctly,
my answer could be work with your problem.
Used library
library(RCurl)
library(XML)
library(rlist)
library(purrr)
Define function
URLfunction<- function(x){
# URL that comes from list
URL <- x
# Load URL
theurl <- RCurl::getURL(URL,.opts = list(ssl.verifypeer = FALSE) )
# Read the tables
tables <- XML::readHTMLTable(theurl)
# Create a list
tables <- rlist::list.clean(tables, fun = is.null, recursive = FALSE)
# Convert the list to a data frame
df <- do.call(rbind,tables)
# Save dataframe out as a csv file
return(df)
}
Assume you have a data like below
( I am not sure what data looks like you have )
URL <- c("https://stackoverflow.com/questions/56139810/how-to-call-a-script-in-another-script-in-r",
"https://stackoverflow.com/questions/56122052/labelling-points-on-a-highcharter-scatter-chart/56123057?noredirect=1#comment98909916_56123057")
result<- purrr::map(URL, URLfunction)
result <- do.call(rbind, result)
Write.csv is last step
If you want write.csv by each URL , plz move in to URLfunction
write.csv(result, file = dynamicname, row.names=FALSE)
Aditional
List version
URL <- list("https://stackoverflow.com/questions/56139810/how-to-call-a-script-in-another-script-in-r",
"https://stackoverflow.com/questions/56122052/labelling-points-on-a-highcharter-scatter-chart/56123057?noredirect=1#comment98909916_56123057")
result<- purrr::map_df(URL, URLfunction)
>result
asked today yesterday
1 viewed 35 times <NA>
2 active today <NA>
3 viewed <NA> 34 times
4 active <NA> today
CSV
URL <- read.csv("PATH",header = FALSE)
result<- purrr::map_df(URL[,1], URLfunction)
>result
asked today yesterday
1 viewed 35 times <NA>
2 active today <NA>
3 viewed <NA> 34 times
4 active <NA> today
Add edited version of your code.
URLfunction<- function(x){
# URL that comes from list
URL <- x
# Load URL
theurl <- RCurl::getURL(URL,.opts = list(ssl.verifypeer = FALSE) )
# Read the tables
tables <- XML::readHTMLTable(theurl)
# Create a list
tables <- rlist::list.clean(tables, fun = is.null, recursive = FALSE)
# Convert the list to a data frame
df <- do.call(rbind,tables)
# Split date and time column out
df2 <- tidyr::separate(df, "Date / Time", c("Date", "Time"), sep = " ")
# Fill the missing column with text, in this case shapename
shapename <- unlist(qdapRegex::ex_between(URL, "ndxs", ".html"))
# qdapRegex::ex_between returns list type, when it added to df2 it couldn't be saved.
# So i added 'unlist'
df2$Shape <- shapename
# Save dataframe out as a csv file
write.csv(df2, paste0(shapename, '.csv'), row.names=FALSE)
# Here are two error.
# First, You maked the data named 'df2' not 'result'. So i changed result -->df2
# Second, row.names is not the 'paste0' attributes, it is 'write.csv's attributes.
return(df2)
}
After defining above function,
URL = c("nuforc.org/webreports/ndxsRectangle.html",
"nuforc.org/webreports/ndxsRound.html")
RESULT = purrr::map_df(URL, URLfunction) ## Also tried purrr::map_df(URL[,1], URLfunction)
Finally, i get the result below
1. Rectangle.csv, Round.csv files on your desktop(Saved path).
2. Returning row binded data frame looks like below (2011 x 8)
> RESULT[1,]
Date Time City State Shape Duration
1 5/2/19 00:20 Honolulu HI Rectangle 3 seconds
Summary
1 Several of rectangles connected in different LED like colors. Such as red, green, blue, etc. ;above Waikiki. ((anonymous report))
Posted
1 5/9/19

row not consolidating duplicates in R when using multiple months in Date Filter

I am using the following code to summarize my data by a column
library(data.table, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
################
## PARAMETERS ##
################
# Set path of major source folder for raw transaction data
in_directory <- "C:/Users/NAME/Documents/Raw Data/"
# List names of sub-folders (currently grouped by first two characters of CUST_ID)
in_subfolders <- list("AA-CA", "CB-HZ")
# Set location for output
out_directory <- "C:/Users/NAME/Documents/YTD Master/"
out_filename <- "OUTPUT.csv"
# Set beginning and end of date range to be collected - year-month-day format
date_range <- interval(as.Date("2017-01-01"), as.Date("2017-01-31"))
# Enable or disable filtering of raw files to only grab items bought within certain months to save space.
# If false, all files will be scanned for unique items, which will take longer and be a larger file.
date_filter <- TRUE
##########
## CODE ##
##########
starttime <- Sys.time()
mastertable <- NULL
for (j in 1:length(in_subfolders)) {
subfolder <- in_subfolders[j]
sub_directory <- paste0(in_directory, subfolder, "/")
## IMPORT DATA
in_filenames <- dir(sub_directory, pattern =".txt")
for (i in 1:length(in_filenames)) {
# Default value provided for when fast filtering is disabled.
read_this_file <- TRUE
# To fast filter the data, we choose to include or exclude an entire file based on the date of its first line.
# WARNING: This is only a valid method if filtering by entire months, since that is the amount of data housed in each file.
if (date_filter) {
temptable <- fread(paste0(sub_directory, in_filenames[i]), colClasses=c(CUSTOMER_TIER = "character"),
na.strings = "", nrows = 1)
temptable[, INVOICE_DT := as.Date(INVOICE_DT)]
# If date matches, set read flag to TRUE. If date does not match, set read flag to FALSE.
read_this_file <- temptable[, INVOICE_DT] %within% date_range
}
if (read_this_file) {
print(Sys.time()-starttime)
print(paste0("Reading in ", in_filenames[i]))
temptable <- fread(paste0(sub_directory, in_filenames[i]), colClasses = c(CUSTOMER_TIER = "character"),
na.strings = "")
temptable <- temptable[,lapply(.SD, sum), by = .(CUST_ID),
.SDcols = c("Ext Sale")]
# Combine into full list
mastertable <- rbindlist(list(mastertable, temptable), use.names = TRUE)
# Release unneeded memory
rm(temptable)
}
}
}
# Save Final table
print("Saving master table")
fwrite(mastertable, paste0(out_directory, out_filename))
rm(mastertable)
print(Sys.time()-starttime)
The output i receive after running the above script for the month of January is as below and this is the output I expect.
CUST_ID Ext Sale
AK0010001 209.97
CO0020001 1540.3
The problem arises when i use multiple months. Below is the output I receive when I run Jan-Feb date_range <- interval(as.Date("2017-01-01"), as.Date("2017-02-28"))
CUST_ID Ext Sale
AK0010001 209.97
AK0010001 217.833
CO0020001 1540.3
CO0010001 -179.765
As you can see in the output above the CUST_ID is no longer consolidating.
Does anyone know why this would be happening?
Below I have provided some data to reproduce what I am working with. Just save the files into 4 separate text file and into folders like I have it in my code.
I have 2 separate folders saved as "AA-CA" and "CB-HZ"
File 1 saved as "AA-CA 2017-01.txt"
INVOICE_DT,BRANCH_CODE,INVOICE_NO,INV_SEQ_NO,INV_ITEM_ID,ITEM_DESCR,STD_ITEM,PRIVATE_LABEL,CATEGORY_PATH1,CATEGORY_PATH2,CUST_ID,CUSTOMER_TIER,IS_VENDING,SALE_PRICE,TOTAL_COST,POS_COST,CE100,CE110,CE120,CE200,CORP_PRICE,QTY_SOLD,PACKSLIP_WHSL,PRICING_GROUP,PGG_MIN_PRICE,PGY_MIN_PRICE,PGR_MIN_PRICE,Ext Sale,Ext Total Cost
2017-01-27,AK001,AK0016997,4,12772-00079,"3.75"""""""" 4.12"""""""" HOSE OD",N,N,08.5-Fleet & Automotive,01.6-DOT Hose & Tubing,AK0010001,Tier 3,No,42.74,22.438335,22.438335,21.37,,,0,,3,,PGR,168.2875125,134.63001,112.191675,128.22,67.315005
2017-01-27,AK001,AK0016997,3,12772-00022,"2.5"""""""" 2.87"""""""" HOSE OD C",N,N,08-Hydraulics & Pneumatics,02-Hose and Hose Reels,AK0010001,Tier 3,No,27.25,14.143396,14.143396,13.47,,,0,,3,,PGR,106.07547,84.860376,70.71698,81.75,42.430188
File 2 saved as "AA-CA 2017-02.txt"
INVOICE_DT,BRANCH_CODE,INVOICE_NO,INV_SEQ_NO,INV_ITEM_ID,ITEM_DESCR,STD_ITEM,PRIVATE_LABEL,CATEGORY_PATH1,CATEGORY_PATH2,CUST_ID,CUSTOMER_TIER,IS_VENDING,SALE_PRICE,TOTAL_COST,POS_COST,CE100,CE110,CE120,CE200,CORP_PRICE,QTY_SOLD,PACKSLIP_WHSL,PRICING_GROUP,PGG_MIN_PRICE,PGY_MIN_PRICE,PGR_MIN_PRICE,Ext Sale,Ext Total Cost
2017-02-28,AK001,AK0017107,1,12772-00307,3-WAY MALE HOUSING,N,N,09-Electrical,05.5-Terminals and Wire Connectors,AK0010001,Tier 3,No,95.21,74.591453,74.591453,71.04,,,0,,1,,PGG,0,0,0,95.21,74.591453
2017-02-28,AK001,AK0017105,3,99523968,PC58570 1/2 PRS BALL,Y,N,,,AK0010001,Tier 3,No,24.5246,12.356039,12.356039,11.767743,,,0,,5,,PGG,0,0,0,122.623,61.780195
File 3 saved as "CB-HZ 2017-01.txt"
INVOICE_DT,BRANCH_CODE,INVOICE_NO,INV_SEQ_NO,INV_ITEM_ID,ITEM_DESCR,STD_ITEM,PRIVATE_LABEL,CATEGORY_PATH1,CATEGORY_PATH2,CUST_ID,CUSTOMER_TIER,IS_VENDING,SALE_PRICE,TOTAL_COST,POS_COST,CE100,CE110,CE120,CE200,CORP_PRICE,QTY_SOLD,PACKSLIP_WHSL,PRICING_GROUP,PGG_MIN_PRICE,PGY_MIN_PRICE,PGR_MIN_PRICE,Ext Sale,Ext Total Cost
2017-01-31,CO002,CO0023603,19,13117-00095,8-32X5/16 BHSCS MAG,N,N,18-Work Order Parts,Finished Products,CO0020001,Tier 3,No,0.1858,0.037528,0.037528,0.01833,,,0,,6000,,PGG,0,0,0,1114.8,225.168
2017-01-31,CO002,CO0023603,20,13117-00186,"#8-16X3/4"""""""" 6-LOBE PA",N,N,01-Fasteners,03-Screws,CO0020001,Tier 3,No,0.0851,0.029652,0.029652,,,,0,,5000,,PGG,0,0,0,425.5,148.26
File 4 saved as "CB-HZ 2017-02.txt"
INVOICE_DT,BRANCH_CODE,INVOICE_NO,INV_SEQ_NO,INV_ITEM_ID,ITEM_DESCR,STD_ITEM,PRIVATE_LABEL,CATEGORY_PATH1,CATEGORY_PATH2,CUST_ID,CUSTOMER_TIER,IS_VENDING,SALE_PRICE,TOTAL_COST,POS_COST,CE100,CE110,CE120,CE200,CORP_PRICE,QTY_SOLD,PACKSLIP_WHSL,PRICING_GROUP,PGG_MIN_PRICE,PGY_MIN_PRICE,PGR_MIN_PRICE,Ext Sale,Ext Total Cost
2017-02-03,CO001,CO0019017,1,MN2550000A20000,M6-1.0 HEX NUT A-2,Y,N,01-Fasteners,04-Nuts,CO0010001,NA,No,0.0313,0.00767,0.00767,0.006215,0.000593,,0.001241,,-50,0.1058,,,,,-1.565,-0.3835
2017-02-16,CO001,CO0019018,1,11516769,RS37518BlkRndSpacer,Y,N,01.5-Hardware,Electronic Hardware,CO0010001,NA,No,0.0396,0.011245,0.011245,0.01071,,,0,,-4500,0.0543,,,,,-178.2,-50.6025
I have the data saved in 2 separate folders.
The OP is wondering why the result is not consolidated for CUST_ID if more than one month of data is processed.
The reason is that the monthly files are read in and aggregated one by one but a final aggregation step is needed to consolidate over all months.
The code below is a simplified replacement of the double for loops. I have left out the code for testing for "fast filtering".
The first part creates a list of files to be processed. The second part does the processing.
# create vector of filenames to be processed
in_filenames <- list.files(
file.path(in_directory, in_subfolders),
pattern = "\\.txt$",
full.names = TRUE,
recursive = TRUE)
# read and aggregate each file separately
mastertable <- rbindlist(
lapply(in_filenames, function(fn) {
# code for "fast filter" test goes here
message("Reading in ", fn)
temptable <- fread(fn,
colClasses = c(CUSTOMER_TIER = "character"),
na.strings = "")
# aggregate
temptable[, lapply(.SD, sum), by = .(CUST_ID), .SDcols = c("Ext Sale")]
})
)[
# THIS IS THE MISSING STEP:
# second aggregation for overall totals
, lapply(.SD, sum), by = .(CUST_ID), .SDcols = c("Ext Sale")]
Processing file: Raw Data/AA-CA/AA-CA 2017-01.txt
Processing file: Raw Data/AA-CA/AA-CA 2017-02.txt
Processing file: Raw Data/CB-HZ/CB-HZ 2017-01.txt
Processing file: Raw Data/CB-HZ/CB-HZ 2017-02.txt
mastertable
CUST_ID Ext Sale
1: AK0010001 427.803
2: CO0020001 1540.300
3: CO0010001 -179.765
Note that chaining of data.table expressions is used here.
Edit 1:
By request of the OP, here is the complete code (except for the "fast filtering" stuff). There are some additional lines which where modified. They are marked with ### MODIFIED.
library(data.table, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
################
## PARAMETERS ##
################
# Set path of major source folder for raw transaction data
in_directory <- "Raw Data" ### MODIFIED
# List names of sub-folders (currently grouped by first two characters of CUST_ID)
in_subfolders <- list("AA-CA", "CB-HZ")
# Set location for output
out_directory <- "YTD Master" ### MODIFIED
out_filename <- "OUTPUT.csv"
# Set beginning and end of date range to be collected - year-month-day format
date_range <- interval(as.Date("2017-01-01"), as.Date("2017-02-28")) ### MODIFIED
# Enable or disable filtering of raw files to only grab items bought within certain months to save space.
# If false, all files will be scanned for unique items, which will take longer and be a larger file.
date_filter <- TRUE
##########
## CODE ##
##########
starttime <- Sys.time()
# create vector of filenames to be processed
in_filenames <- list.files(
file.path(in_directory, in_subfolders),
pattern = "\\.txt$",
full.names = TRUE,
recursive = TRUE)
# read and aggregate each file separetely
mastertable <- rbindlist(
lapply(in_filenames, function(fn) {
# code for fast filter test goes here
message("Processing file: ", fn)
temptable <- fread(fn,
colClasses = c(CUSTOMER_TIER = "character"),
na.strings = "")
# aggregate by month
temptable[, lapply(.SD, sum), by = .(CUST_ID), .SDcols = c("Ext Sale")]
})
)[
# second aggregation overall
, lapply(.SD, sum), by = .(CUST_ID), .SDcols = c("Ext Sale")]
# Save Final table
print("Saving master table")
fwrite(mastertable, paste0(out_directory, out_filename))
# rm(mastertable) ### MODIFIED
print(Sys.time()-starttime)
Edit 2
The OP has asked to include the "fast filter" code which I had omitted for brevity.
However, I have a different approach. Instead of reading the first line of each file to check if INVOICE_DT is within the given date_range my approach filters the file names. The file names contain the year-month in ISO 8601 format.
So, a vector of allowed year-month strings is constructed from the given date_range. Only those file names which contain one of the allowed year-month strings are selected for further processing.
However, selecting the proper files is only the first step. As the date-range may start or end right in the middel of a month, we need also to filter the rows of each processed file. This step is missing from OP's code.
library(data.table, warn.conflicts = FALSE)
library(magrittr) ### MODIFIED
# library(lubridate, warn.conflicts = FALSE) ### MODIFIED
################
## PARAMETERS ##
################
# Set path of major source folder for raw transaction data
in_directory <- "Raw Data" ### MODIFIED
# List names of sub-folders (currently grouped by first two characters of CUST_ID)
in_subfolders <- list("AA-CA", "CB-HZ")
# Set location for output
out_directory <- "YTD Master" ### MODIFIED
out_filename <- "OUTPUT.csv"
# Set beginning and end of date range to be collected - year-month-day format
date_range <- c("2017-01-01", "2017-02-14") ### MODIFIED
# Enable or disable filtering of raw files to only grab items bought within certain months to save space.
# If false, all files will be scanned for unique items, which will take longer and be a larger file.
# date_filter <- TRUE ### MODIFIED
##########
## CODE ##
##########
starttime <- Sys.time()
# create vector of filenames to be processed
in_filenames <- list.files(
file.path(in_directory, in_subfolders),
pattern = "\\.txt$",
full.names = TRUE,
recursive = TRUE)
# filter filenames, only
selected_in_filenames <-
seq(as.Date(date_range[1]),
as.Date(date_range[2]), by = "1 month") %>%
format("%Y-%m") %>%
lapply(function(x) stringr::str_subset(in_filenames, x)) %>%
unlist()
# read and aggregate each file separetely
mastertable <- rbindlist(
lapply(selected_in_filenames, function(fn) {
message("Processing file: ", fn)
temptable <- fread(fn,
colClasses = c(CUSTOMER_TIER = "character"),
na.strings = "")
# aggregate file but filtered for date_range
temptable[INVOICE_DT %between% date_range,
lapply(.SD, sum), by = .(CUST_ID, QTR = quarter(INVOICE_DT)),
.SDcols = c("Ext Sale")]
})
)[
# second aggregation overall
, lapply(.SD, sum), by = .(CUST_ID, QTR), .SDcols = c("Ext Sale")]
# Save Final table
print("Saving master table")
fwrite(mastertable, file.path(out_directory, out_filename))
# rm(mastertable) ### MODIFIED
print(Sys.time()-starttime)
mastertable
CUST_ID QTR Ext Sale
1: AK0010001 1 209.970
2: CO0020001 1 1540.300
3: CO0010001 1 -1.565
Note that date_range <- c("2017-01-01", "2017-02-14") now ends mid of February.

How to get a subset of Data that is read in by folder? [duplicate]

I am using the following code to summarize my data by a column
library(data.table, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
################
## PARAMETERS ##
################
# Set path of major source folder for raw transaction data
in_directory <- "C:/Users/NAME/Documents/Raw Data/"
# List names of sub-folders (currently grouped by first two characters of CUST_ID)
in_subfolders <- list("AA-CA", "CB-HZ")
# Set location for output
out_directory <- "C:/Users/NAME/Documents/YTD Master/"
out_filename <- "OUTPUT.csv"
# Set beginning and end of date range to be collected - year-month-day format
date_range <- interval(as.Date("2017-01-01"), as.Date("2017-01-31"))
# Enable or disable filtering of raw files to only grab items bought within certain months to save space.
# If false, all files will be scanned for unique items, which will take longer and be a larger file.
date_filter <- TRUE
##########
## CODE ##
##########
starttime <- Sys.time()
mastertable <- NULL
for (j in 1:length(in_subfolders)) {
subfolder <- in_subfolders[j]
sub_directory <- paste0(in_directory, subfolder, "/")
## IMPORT DATA
in_filenames <- dir(sub_directory, pattern =".txt")
for (i in 1:length(in_filenames)) {
# Default value provided for when fast filtering is disabled.
read_this_file <- TRUE
# To fast filter the data, we choose to include or exclude an entire file based on the date of its first line.
# WARNING: This is only a valid method if filtering by entire months, since that is the amount of data housed in each file.
if (date_filter) {
temptable <- fread(paste0(sub_directory, in_filenames[i]), colClasses=c(CUSTOMER_TIER = "character"),
na.strings = "", nrows = 1)
temptable[, INVOICE_DT := as.Date(INVOICE_DT)]
# If date matches, set read flag to TRUE. If date does not match, set read flag to FALSE.
read_this_file <- temptable[, INVOICE_DT] %within% date_range
}
if (read_this_file) {
print(Sys.time()-starttime)
print(paste0("Reading in ", in_filenames[i]))
temptable <- fread(paste0(sub_directory, in_filenames[i]), colClasses = c(CUSTOMER_TIER = "character"),
na.strings = "")
temptable <- temptable[,lapply(.SD, sum), by = .(CUST_ID),
.SDcols = c("Ext Sale")]
# Combine into full list
mastertable <- rbindlist(list(mastertable, temptable), use.names = TRUE)
# Release unneeded memory
rm(temptable)
}
}
}
# Save Final table
print("Saving master table")
fwrite(mastertable, paste0(out_directory, out_filename))
rm(mastertable)
print(Sys.time()-starttime)
The output i receive after running the above script for the month of January is as below and this is the output I expect.
CUST_ID Ext Sale
AK0010001 209.97
CO0020001 1540.3
The problem arises when i use multiple months. Below is the output I receive when I run Jan-Feb date_range <- interval(as.Date("2017-01-01"), as.Date("2017-02-28"))
CUST_ID Ext Sale
AK0010001 209.97
AK0010001 217.833
CO0020001 1540.3
CO0010001 -179.765
As you can see in the output above the CUST_ID is no longer consolidating.
Does anyone know why this would be happening?
Below I have provided some data to reproduce what I am working with. Just save the files into 4 separate text file and into folders like I have it in my code.
I have 2 separate folders saved as "AA-CA" and "CB-HZ"
File 1 saved as "AA-CA 2017-01.txt"
INVOICE_DT,BRANCH_CODE,INVOICE_NO,INV_SEQ_NO,INV_ITEM_ID,ITEM_DESCR,STD_ITEM,PRIVATE_LABEL,CATEGORY_PATH1,CATEGORY_PATH2,CUST_ID,CUSTOMER_TIER,IS_VENDING,SALE_PRICE,TOTAL_COST,POS_COST,CE100,CE110,CE120,CE200,CORP_PRICE,QTY_SOLD,PACKSLIP_WHSL,PRICING_GROUP,PGG_MIN_PRICE,PGY_MIN_PRICE,PGR_MIN_PRICE,Ext Sale,Ext Total Cost
2017-01-27,AK001,AK0016997,4,12772-00079,"3.75"""""""" 4.12"""""""" HOSE OD",N,N,08.5-Fleet & Automotive,01.6-DOT Hose & Tubing,AK0010001,Tier 3,No,42.74,22.438335,22.438335,21.37,,,0,,3,,PGR,168.2875125,134.63001,112.191675,128.22,67.315005
2017-01-27,AK001,AK0016997,3,12772-00022,"2.5"""""""" 2.87"""""""" HOSE OD C",N,N,08-Hydraulics & Pneumatics,02-Hose and Hose Reels,AK0010001,Tier 3,No,27.25,14.143396,14.143396,13.47,,,0,,3,,PGR,106.07547,84.860376,70.71698,81.75,42.430188
File 2 saved as "AA-CA 2017-02.txt"
INVOICE_DT,BRANCH_CODE,INVOICE_NO,INV_SEQ_NO,INV_ITEM_ID,ITEM_DESCR,STD_ITEM,PRIVATE_LABEL,CATEGORY_PATH1,CATEGORY_PATH2,CUST_ID,CUSTOMER_TIER,IS_VENDING,SALE_PRICE,TOTAL_COST,POS_COST,CE100,CE110,CE120,CE200,CORP_PRICE,QTY_SOLD,PACKSLIP_WHSL,PRICING_GROUP,PGG_MIN_PRICE,PGY_MIN_PRICE,PGR_MIN_PRICE,Ext Sale,Ext Total Cost
2017-02-28,AK001,AK0017107,1,12772-00307,3-WAY MALE HOUSING,N,N,09-Electrical,05.5-Terminals and Wire Connectors,AK0010001,Tier 3,No,95.21,74.591453,74.591453,71.04,,,0,,1,,PGG,0,0,0,95.21,74.591453
2017-02-28,AK001,AK0017105,3,99523968,PC58570 1/2 PRS BALL,Y,N,,,AK0010001,Tier 3,No,24.5246,12.356039,12.356039,11.767743,,,0,,5,,PGG,0,0,0,122.623,61.780195
File 3 saved as "CB-HZ 2017-01.txt"
INVOICE_DT,BRANCH_CODE,INVOICE_NO,INV_SEQ_NO,INV_ITEM_ID,ITEM_DESCR,STD_ITEM,PRIVATE_LABEL,CATEGORY_PATH1,CATEGORY_PATH2,CUST_ID,CUSTOMER_TIER,IS_VENDING,SALE_PRICE,TOTAL_COST,POS_COST,CE100,CE110,CE120,CE200,CORP_PRICE,QTY_SOLD,PACKSLIP_WHSL,PRICING_GROUP,PGG_MIN_PRICE,PGY_MIN_PRICE,PGR_MIN_PRICE,Ext Sale,Ext Total Cost
2017-01-31,CO002,CO0023603,19,13117-00095,8-32X5/16 BHSCS MAG,N,N,18-Work Order Parts,Finished Products,CO0020001,Tier 3,No,0.1858,0.037528,0.037528,0.01833,,,0,,6000,,PGG,0,0,0,1114.8,225.168
2017-01-31,CO002,CO0023603,20,13117-00186,"#8-16X3/4"""""""" 6-LOBE PA",N,N,01-Fasteners,03-Screws,CO0020001,Tier 3,No,0.0851,0.029652,0.029652,,,,0,,5000,,PGG,0,0,0,425.5,148.26
File 4 saved as "CB-HZ 2017-02.txt"
INVOICE_DT,BRANCH_CODE,INVOICE_NO,INV_SEQ_NO,INV_ITEM_ID,ITEM_DESCR,STD_ITEM,PRIVATE_LABEL,CATEGORY_PATH1,CATEGORY_PATH2,CUST_ID,CUSTOMER_TIER,IS_VENDING,SALE_PRICE,TOTAL_COST,POS_COST,CE100,CE110,CE120,CE200,CORP_PRICE,QTY_SOLD,PACKSLIP_WHSL,PRICING_GROUP,PGG_MIN_PRICE,PGY_MIN_PRICE,PGR_MIN_PRICE,Ext Sale,Ext Total Cost
2017-02-03,CO001,CO0019017,1,MN2550000A20000,M6-1.0 HEX NUT A-2,Y,N,01-Fasteners,04-Nuts,CO0010001,NA,No,0.0313,0.00767,0.00767,0.006215,0.000593,,0.001241,,-50,0.1058,,,,,-1.565,-0.3835
2017-02-16,CO001,CO0019018,1,11516769,RS37518BlkRndSpacer,Y,N,01.5-Hardware,Electronic Hardware,CO0010001,NA,No,0.0396,0.011245,0.011245,0.01071,,,0,,-4500,0.0543,,,,,-178.2,-50.6025
I have the data saved in 2 separate folders.
The OP is wondering why the result is not consolidated for CUST_ID if more than one month of data is processed.
The reason is that the monthly files are read in and aggregated one by one but a final aggregation step is needed to consolidate over all months.
The code below is a simplified replacement of the double for loops. I have left out the code for testing for "fast filtering".
The first part creates a list of files to be processed. The second part does the processing.
# create vector of filenames to be processed
in_filenames <- list.files(
file.path(in_directory, in_subfolders),
pattern = "\\.txt$",
full.names = TRUE,
recursive = TRUE)
# read and aggregate each file separately
mastertable <- rbindlist(
lapply(in_filenames, function(fn) {
# code for "fast filter" test goes here
message("Reading in ", fn)
temptable <- fread(fn,
colClasses = c(CUSTOMER_TIER = "character"),
na.strings = "")
# aggregate
temptable[, lapply(.SD, sum), by = .(CUST_ID), .SDcols = c("Ext Sale")]
})
)[
# THIS IS THE MISSING STEP:
# second aggregation for overall totals
, lapply(.SD, sum), by = .(CUST_ID), .SDcols = c("Ext Sale")]
Processing file: Raw Data/AA-CA/AA-CA 2017-01.txt
Processing file: Raw Data/AA-CA/AA-CA 2017-02.txt
Processing file: Raw Data/CB-HZ/CB-HZ 2017-01.txt
Processing file: Raw Data/CB-HZ/CB-HZ 2017-02.txt
mastertable
CUST_ID Ext Sale
1: AK0010001 427.803
2: CO0020001 1540.300
3: CO0010001 -179.765
Note that chaining of data.table expressions is used here.
Edit 1:
By request of the OP, here is the complete code (except for the "fast filtering" stuff). There are some additional lines which where modified. They are marked with ### MODIFIED.
library(data.table, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
################
## PARAMETERS ##
################
# Set path of major source folder for raw transaction data
in_directory <- "Raw Data" ### MODIFIED
# List names of sub-folders (currently grouped by first two characters of CUST_ID)
in_subfolders <- list("AA-CA", "CB-HZ")
# Set location for output
out_directory <- "YTD Master" ### MODIFIED
out_filename <- "OUTPUT.csv"
# Set beginning and end of date range to be collected - year-month-day format
date_range <- interval(as.Date("2017-01-01"), as.Date("2017-02-28")) ### MODIFIED
# Enable or disable filtering of raw files to only grab items bought within certain months to save space.
# If false, all files will be scanned for unique items, which will take longer and be a larger file.
date_filter <- TRUE
##########
## CODE ##
##########
starttime <- Sys.time()
# create vector of filenames to be processed
in_filenames <- list.files(
file.path(in_directory, in_subfolders),
pattern = "\\.txt$",
full.names = TRUE,
recursive = TRUE)
# read and aggregate each file separetely
mastertable <- rbindlist(
lapply(in_filenames, function(fn) {
# code for fast filter test goes here
message("Processing file: ", fn)
temptable <- fread(fn,
colClasses = c(CUSTOMER_TIER = "character"),
na.strings = "")
# aggregate by month
temptable[, lapply(.SD, sum), by = .(CUST_ID), .SDcols = c("Ext Sale")]
})
)[
# second aggregation overall
, lapply(.SD, sum), by = .(CUST_ID), .SDcols = c("Ext Sale")]
# Save Final table
print("Saving master table")
fwrite(mastertable, paste0(out_directory, out_filename))
# rm(mastertable) ### MODIFIED
print(Sys.time()-starttime)
Edit 2
The OP has asked to include the "fast filter" code which I had omitted for brevity.
However, I have a different approach. Instead of reading the first line of each file to check if INVOICE_DT is within the given date_range my approach filters the file names. The file names contain the year-month in ISO 8601 format.
So, a vector of allowed year-month strings is constructed from the given date_range. Only those file names which contain one of the allowed year-month strings are selected for further processing.
However, selecting the proper files is only the first step. As the date-range may start or end right in the middel of a month, we need also to filter the rows of each processed file. This step is missing from OP's code.
library(data.table, warn.conflicts = FALSE)
library(magrittr) ### MODIFIED
# library(lubridate, warn.conflicts = FALSE) ### MODIFIED
################
## PARAMETERS ##
################
# Set path of major source folder for raw transaction data
in_directory <- "Raw Data" ### MODIFIED
# List names of sub-folders (currently grouped by first two characters of CUST_ID)
in_subfolders <- list("AA-CA", "CB-HZ")
# Set location for output
out_directory <- "YTD Master" ### MODIFIED
out_filename <- "OUTPUT.csv"
# Set beginning and end of date range to be collected - year-month-day format
date_range <- c("2017-01-01", "2017-02-14") ### MODIFIED
# Enable or disable filtering of raw files to only grab items bought within certain months to save space.
# If false, all files will be scanned for unique items, which will take longer and be a larger file.
# date_filter <- TRUE ### MODIFIED
##########
## CODE ##
##########
starttime <- Sys.time()
# create vector of filenames to be processed
in_filenames <- list.files(
file.path(in_directory, in_subfolders),
pattern = "\\.txt$",
full.names = TRUE,
recursive = TRUE)
# filter filenames, only
selected_in_filenames <-
seq(as.Date(date_range[1]),
as.Date(date_range[2]), by = "1 month") %>%
format("%Y-%m") %>%
lapply(function(x) stringr::str_subset(in_filenames, x)) %>%
unlist()
# read and aggregate each file separetely
mastertable <- rbindlist(
lapply(selected_in_filenames, function(fn) {
message("Processing file: ", fn)
temptable <- fread(fn,
colClasses = c(CUSTOMER_TIER = "character"),
na.strings = "")
# aggregate file but filtered for date_range
temptable[INVOICE_DT %between% date_range,
lapply(.SD, sum), by = .(CUST_ID, QTR = quarter(INVOICE_DT)),
.SDcols = c("Ext Sale")]
})
)[
# second aggregation overall
, lapply(.SD, sum), by = .(CUST_ID, QTR), .SDcols = c("Ext Sale")]
# Save Final table
print("Saving master table")
fwrite(mastertable, file.path(out_directory, out_filename))
# rm(mastertable) ### MODIFIED
print(Sys.time()-starttime)
mastertable
CUST_ID QTR Ext Sale
1: AK0010001 1 209.970
2: CO0020001 1 1540.300
3: CO0010001 1 -1.565
Note that date_range <- c("2017-01-01", "2017-02-14") now ends mid of February.

How to summarize by Quarter in R

I am having some difficulties on summarizing data from my database in R. I am looking to pull the data and have it summarized by Quarter.
Below is the code i am using to get a txt output but I am getting errors.
What do I need to do to manipulate the code to run this so that I can have the data be summarized by quarter?
library(data.table, warn.conflicts = FALSE)
library(lubridate, warn.conflicts = FALSE)
################
## PARAMETERS ##
################
# Set path of major source folder for raw transaction data
in_directory <- "C:/Users/name/Documents/Raw Data/"
# List names of sub-folders (currently grouped by first two characters of
CUST_ID)
in_subfolders <- list("AA-CA", "CB-HZ", "IA-IL", "IM-KZ", "LA-MI", "MJ-MS",
"MT-NV", "NW-OH", "OI-PZ", "QA-TN", "TO-UZ",
"VA-WA", "WB-ZZ")
# Set location for output
out_directory <- "C:/Users/name/Documents/YTD Master/"
out_filename <- "NEW.csv"
# Set beginning and end of date range to be collected - year-month-day format
date_range <- interval(as.Date("2018-01-01"), as.Date("2018-05-31"))
# Enable or disable filtering of raw files to only grab items bought within
certain months to save space.
# If false, all files will be scanned for unique items, which will take
longer and be a larger file.
date_filter <- TRUE
##########
## CODE ##
##########
starttime <- Sys.time()
mastertable <- NULL
for (j in 1:length(in_subfolders)) {
subfolder <- in_subfolders[j]
sub_directory <- paste0(in_directory, subfolder, "/")
## IMPORT DATA
in_filenames <- dir(sub_directory, pattern =".txt")
for (i in 1:length(in_filenames)) {
# Default value provided for when fast filtering is disabled.
read_this_file <- TRUE
# To fast filter the data, we choose to include or exclude an entire file
based on the date of its first line.
# WARNING: This is only a valid method if filtering by entire months,
since that is the amount of data housed in each file.
if (date_filter) {
temptable <- fread(paste0(sub_directory, in_filenames[i]),
colClasses=c(CUSTOMER_TIER = "character"),
na.strings = "", nrows = 1)
temptable[, INVOICE_DT := as.Date(INVOICE_DT)]
# If date matches, set read flag to TRUE. If date does not match, set
read flag to FALSE.
read_this_file <- temptable[, INVOICE_DT] %within% date_range
}
if (read_this_file) {
print(Sys.time()-starttime)
print(paste0("Reading in ", in_filenames[i]))
temptable <- fread(paste0(sub_directory, in_filenames[i]), colClasses=c(CUSTOMER_TIER = "character"),
na.strings = "")
temptable <- temptable[, lapply(.SD, sum), by = quarter(INVOICE_DT),
.SDcols = c("INV_ITEM_ID","Ext Sale", "Ext Total Cost", "CE100", "CE110","CE120","QTY_SOLD","PACKSLIP_WHSL")]
# Combine into full list
mastertable <- rbindlist(list(mastertable, temptable), use.names = TRUE)
# Release unneeded memory
rm(temptable)
}
}
}
# Save Final table
print("Saving master table")
fwrite(mastertable, paste0(out_directory, out_filename))
rm(mastertable)
print(Sys.time()-starttime)
After running this scrip the below is the error message i receive.
Error in gsum(INV_ITEM_ID) :
Type 'character' not supported by GForce sum (gsum). Either add the prefix base::sum(.) or turn off GForce optimization using options(datatable.optimize=1)
Here is the general approach with some generic data.
library(tidyverse)
library(lubridate)
data.frame(date = seq(as.Date('2010-01-12'), as.Date('2018-02-03'), by = 100),
var = runif(30)) %>%
group_by(quarter(date, with_year = T)) %>%
summarize(average_var = mean(var))
you can leave out the "with_year = T" if you don't care about the differences between years.

Resources