Combining multiple data sets in R - r

I am a complete beginner in R/ programming language. Right now I am trying to process hundreds of comma separated data files using R. For time series analyses, I need to concatenate the data sets sequentially. Unfortunately, the data files do not have a designated column with time stamp and have some header lines. For that, I am parsing the file creation time from the second line of the data file and adding timesteps based on the sampling frequency which can be found in the third line of the data file. Also, the sampling frequency will vary from files to files that can be identified from the regex patterns in filename. The first three header lines look like this:
SPU1 Monitor Data File
SPU Data Filename = 06Aug2021 ,07 -08 -28,s1c1h17.txt
Sample Frequency = 1
Or
SPU1 Traffic Data File
SPU Data Filename = 05Aug2021 ,02 -48 -14,s1c1p2311.txt
Sample Frequency = 20
I have tried the for loop as well as the lapply. When I am trying the for loop, the script only run once. When I am trying the lapply, I am getting the following message. What am I doing wrong?
[Error in file(file, "rt") : invalid 'description' argument
In addition: Warning messages:
1: In n.readLines(paste(filenames\[i\], sep = ","), header = FALSE, n = 1, :
file doesn't exist
2: In n.readLines(paste(filenames\[i\], sep = ",|\\s|-"), header = FALSE, :
file doesn't exist
Called from: file(file, "rt")][1]
Here is the code I am trying:
setwd("C:/Users/rottweiller/Desktop/Practicing R")
filenames <- list.files(path="C:/Users/rottweiller/Desktop/Practicing R", pattern="c1h|c1p", full.names=FALSE)
library(reader)
library(readr)
library(tidyverse)
AddTS <- function(filenames){
#frq1 <- parse_number(n.readLines(paste(filenames[i], sep = ","), header = FALSE, n = 1, skip = 2))
frq1 <- as.integer(gsub("\\D", "", n.readLines(paste(filenames[i], sep = ","), header = FALSE, n = 1, skip = 2)))
TL1 <- n.readLines(paste(filenames[i], sep = ",|\\s|-"), header = FALSE, n = 1, skip = 1)
SUTC1 <- lubridate::parse_date_time(gsub("\\s-|\\s", "",
stringr::str_extract(TL1, "[SPU Data Filename = ]?\\d{2}\\D{3}\\d{4}\\s\\,\\d{2}\\s-\\d{2}\\s-\\d{2}")), orders = "dmYHMS")
C1 <- as.data.frame(read.delim(filenames[i], header = FALSE, sep = ",", skip = 79))
C1[] <- lapply(C1, function(j) if(is.numeric(j)) ifelse(is.infinite(j), 0, j) else j)
TS1 <- SUTC1 + (1/frq1)*seq_len(nrow(C1))
Card1 <- cbind(TS1, C1)
}
combined <- dplyr::bind_rows(lapply(filenames, AddTS))
Or
[for(i in 1:length(filenames)){
frq1 <- parse_number(n.readLines(paste(filenames\[i\], sep = ","), header = FALSE, n = 1, skip = 2), trim_ws = TRUE)
TL1 <- n.readLines(paste(filenames\[i\], sep = ",|\\s|-"), header = FALSE, n = 1, skip = 1)
SUTC1 <- lubridate::parse_date_time(gsub("\\s-|\\s", "",
stringr::str_extract(TL1, "\[SPU Data Filename = \]?\\d{2}\\D{3}\\d{4}\\s\\,\\d{2}\\s-\\d{2}\\s-\\d{2}")),
orders = "dmYHMS")
C1 <- as.data.frame(read.delim(filenames\[i\], header = FALSE, sep = ",", skip = 79))
C1\[\] <- lapply(C1, function(j) if(is.numeric(j)) ifelse(is.infinite(j), 0, j) else j)
TS1 <- SUTC1 + (1/frq1)*seq_len(nrow(C1))
Card1 <- cbind(TS1, C1)
}][1]

It's a good starting step that you already know about regular expressions and recent R libraries.
You could do something like this:
purrr::map_dfr(filenames, function(f) {
lines <- readLines(file(f))
frq <- lines[3] %>%
str_replace(".*?(\\d*)$", "\\1") %>%
as.integer()
frq
SUTC <- lines[2] %>%
stringr::str_extract("[SPU Data Filename = ]?\\d{2}\\D{3}\\d{4}\\s\\,\\d{2}\\s-\\d{2}\\s-\\d{2}") %>%
lubridate::parse_date_time(orders = "dmYHMS")
SUTC
C <- lines[(which(lines == "end of text") + 2):length(lines)] %>%
textConnection() %>%
read.delim(header = FALSE, sep = ",") %>%
mutate(across(.fns = ~ if_else(. == Inf, 0, .)))
C
TS <- SUTC + seq_len(nrow(C)) / frq
bind_cols(file = f, TS = TS, C)
})

Related

How can i start this code found on github?

I'm following this code on github and in line 51 i have a problem with option[i,]<- skew.raw why? Said: object "i" not found. Why? What should i put?
It also fails to take values as after starting the get.option function I have NA values.
# Define function for formating/retrieving options data from json obj
get.options = function(symbols, date){
options = matrix(ncol = 11, nrow = length(symbols))
colnames(options) = c('Cl_price', "call_strike",
"call_lastPrice","call_vol","call_openInt", "call_ImpVoli",
"put_strike","put_lastPrice", 'put_vol',"put_openInt", 'put_ImpVoli')
rownames(options) = symbols
for(u in 1:length(symbols)){
s = symbols[u]
d = as.numeric(as.POSIXct(date, origin = '1970-01-01', tz = 'GMT'))
json_file <- sprintf('https://query2.finance.yahoo.com/v7/finance/options/%s?
date=%d&formatted=true&crumb=UNus6VhY1bn&lang=en-US&region=US&corsDomain=finance.yahoo.com',s,d)
json_data <- suppressWarnings(fromJSON(paste(readLines(json_file), collapse = "")))
# CALLS
n = length(json_data$optionChain$result[[1]]$options[[1]]$calls)
if (n < 1) next
calls = matrix(ncol = 6, nrow = n)
for(i in 1:n) calls[,2][i] = json_data$optionChain$result[[1]]$options[[1]]$calls[[i]]$strike$raw
Cl.price = json_data$optionChain$result[[1]]$quote$regularMarketPrice
x <- which.min(abs((calls[,2]/Cl.price) -1))
calls = calls[x,]
calls[1] = Cl.price
calls[3] = json_data$optionChain$result[[1]]$options[[1]]$calls[[x]]$lastPrice$raw
calls[4] = json_data$optionChain$result[[1]]$options[[1]]$calls[[x]]$volume$raw
calls[5] = json_data$optionChain$result[[1]]$options[[1]]$calls[[x]]$openInterest$raw
calls[6] = json_data$optionChain$result[[1]]$options[[1]]$calls[[x]]$impliedVolatility$raw
# PUTS
n = length(json_data$optionChain$result[[1]]$options[[1]]$puts)
if(n < 1) next
puts = matrix(ncol = 5, nrow = n)
for(i in 1:n) puts[,1][i] = json_data$optionChain$result[[1]]$options[[1]]$puts[[i]]$strike$raw
x <- which.min(abs((puts[,1]/Cl.price) - 0.95))
puts = puts[x,]
puts[2] = json_data$optionChain$result[[1]]$options[[1]]$puts[[x]]$lastPrice$raw
puts[3] = json_data$optionChain$result[[1]]$options[[1]]$puts[[x]]$volume$raw
puts[4] = json_data$optionChain$result[[1]]$options[[1]]$puts[[x]]$openInterest$raw
puts[5] = json_data$optionChain$result[[1]]$options[[1]]$puts[[x]]$impliedVolatility$raw
options[u,] = c(calls, puts)
}
return(options)
}
# Define stocks and gather options data
date = '2017-04-21'
symbols <- c('DIS','CAT','TSLA')
daily.options = as.data.frame(get.options(symbols, date))
which(is.na(daily.options))
skew.raw = daily.options$put_ImpVoli - daily.options$call_ImpVoli # SKEW(i,t)
options[i,] <- skew.raw
write.table(options, 'DISCATTSLA', sep = ",")
options = read.table('DISCATTSLA', sep = ",")**
I’m following this code because I read the paper by Rhui Zhao but in the paper I did not talk about how to implement the skew volatility on a software and then I was able to find this code on github.

Demography package issue with aggregating data

# Function to construct a mortality demogdata object from HMD
hmd.mx <- function(country, username, password, label=country){
path <- paste("https://www.mortality.org/hmd/", country, "/STATS/", "Mx_1x1.txt", sep = "")
}
userpwd <- paste(username, ":", password, sep = "")
txt <- RCurl::getURL(path, userpwd = userpwd)
con <- textConnection(txt)
mx <- try(utils::read.table(con, skip = 2, header = TRUE, na.strings = "."),TRUE)
close(con)
if(class(mx)=="try-error")
stop("Connection error at www.mortality.org. Please check username, password and country label.")
path <- paste("https://www.mortality.org/hmd/", country, "/STATS/", "Exposures_1x1.txt", sep = "")
userpwd <- paste(username, ":", password, sep = "")
txt <- RCurl::getURL(path, userpwd = userpwd)
con <- textConnection(txt)
pop <- try(utils::read.table(con, skip = 2, header = TRUE, na.strings = "."),TRUE)
close(con)
if(class(pop)=="try-error")
stop("Exposures file not found at www.mortality.org")
obj <- list(type="mortality",label=label,lambda=0)
obj$year <- sort(unique(mx[, 1]))
#obj$year <- ts(obj$year, start=min(obj$year))
n <- length(obj$year)
m <- length(unique(mx[, 2]))
obj$age <- mx[1:m, 2]
obj$rate <- obj$pop <- list()
for (i in 1:n.mort)
{ obj$rate[[i]] <- matrix(mx[, i + 2], nrow = m, ncol = n)
obj$rate[[i]][obj$rate[[i]] < 0] <- NA
obj$pop[[i]] <- matrix(pop[, i + 2], nrow = m, ncol = n)
obj$pop[[i]][obj$pop[[i]] < 0] <- NA
dimnames(obj$rate[[i]]) <- dimnames(obj$pop[[i]]) <- list(obj$age, obj$year)
}
names(obj$pop) = names(obj$rate) <- tolower(mnames)
obj$age <- as.numeric(as.character(obj$age))
if (is.na(obj$age[m])) {
obj$age[m] <- 2 * obj$age[m - 1] - obj$age[m - 2] }
return(structure(obj, class = "demogdata"))
}
Above is the code that we are using to import our population data into r.
NLdata <- hmd.mx(country = "NLD",username = "username",password="password")
This would be the specific code to obtain the Dutch data.
Would anyone happen to know how to add multiple countries into one, and put that data into one dataframe (same format as the demography data packages that we download)? So for example the mortality rates for the (Netherlands + France + Norway) / 3 into one package.
You can try this code. However I could not run your demography package. So you might need to edit the code a bit. Perhaps someone else can fill in the second part? I saw that no one has reacted yet.
C1 <- data.frame(Year = 1980:2018, value1 = rnorm(39), value2 = rnorm(39), Cat =rbinom(39,1,0.5), Country = "France")
C2 <- data.frame(Year = 1980:2018, value1 = rnorm(39), value2 = rnorm(39), Cat =rbinom(39,1,0.5),Country = "England")
C3 <- data.frame(Year = 1970:2018, value1 = rnorm(49), value2 = rnorm(49), Cat =rbinom(49,1,0.5),Country = "Netherlands")
C1 <- split(C1, C1$Cat)
C2 <- split(C2, C2$Cat)
C3 <- split(C3, C3$Cat)
list_all <- list(rbind(C1[[1]],C2[[1]],C3[[1]]),rbind(C1[[2]],C2[[2]],C3[[2]]))
Final_list <- lapply(list_all, function(x) x %>% group_by(Year) %>% summarise(Val1 = mean(value1), Val2 = mean(value2), Country = "All") %>% as.data.frame)

Loop URL R too many open files

I have the following file with URLs in it. The Idea is to download image from URL, get a 6 color palette, get the color names and percentages and bind them all together in a list alongside product number. But I get the "too many files" error.
library(readxl)
library(jpeg)
library(scales)
library(plotrix)
library(gridExtra)
library(dplyr)
library(data.table)
dataset = read_excel("C:/Temp/Product.xlsx", sheet = "All")
datalist = list()
nRowsDf <- nrow(dataset)
avector <- as.vector(dataset$URL)
varenummer <- as.vector(dataset$Varenr)
for (i in 1:nRowsDf) {
tryCatch({
#Convert this from Data.frame to Vector
Sku <- as.vector(varenummer[[i]])
download.file(avector[[i]], paste(Sku,".jpg" ,sep = ""), mode = "wb")
painting <- readJPEG(paste(Sku,".jpg" ,sep = ""))
dimension <- dim(painting)
painting_rgb <- data.frame(
x = rep(1:dimension[2], each = dimension[1]),
y = rep(dimension[1]:1, dimension[2]),
R = as.vector(painting[,, 1]), #slicing array into RGB Channels
G = as.vector(painting[,, 2]),
B = as.vector(painting[,, 3])
)
k_means = kmeans(painting_rgb[, c("R", "G", "B")], algorithm = "Lloyd", centers = 6, iter.max = 300)
test = (sapply(rgb(k_means$centers), color.id))
Color = lapply(test, `[[`, 1)
Values = k_means$size
Percentage = k_means$size / sum(k_means$size)
Final = do.call(rbind, Map(data.frame, Color = lapply(test, `[[`, 1), Values = k_means$size, ProductNumber = Sku, Percentage = Percentage))
Final$i <- i # iteration
datalist[[i]] <- Final # add iteration to list
big_data = rbindlist(datalist)
#grid.table(big_data)
write.table(big_data, file = "myDF.csv", sep = ",", col.names = TRUE, append = TRUE)
#R = Final[with(Final, order(-Percentage)),]
}, error = function(e) { closeAllConnections() })
closeAllConnections()
}
Code stops after downloading around 266 unique JPEG images.
This code downloads only JPG files, if another file type is return it will simply ignore it.
Error :
Error in file(file, ifelse(append, "a", "w")) :
cannot open the connection
In addition: Warning message:
In file(file, ifelse(append, "a", "w")) :
cannot open file 'myDF.csv': Too many open files
If I remove the trycatch i get these:
Error in download.file(avector[[i]], "image.jpg", mode = "wb") :
cannot open destfile 'image.jpg', reason 'Too many open files'
The code had an error or better said an unnecessary step, that keep open connections until it reach the limit impose by "file".
By simply removing the iteration steps and rbind datalist, it run flawless.
Below the modified version.
for (i in 1:nRowsDf) {
tryCatch({
#Convert this from Data.frame to Vector
Sku <- as.vector(varenummer[[i]]) #for testing use 23406
download.file(avector[[i]], paste(Sku, ".jpg", sep = ""), mode = "wb")
# painting <- readJPEG(paste(Sku,".jpg" ,sep = ""))
painting = load.image(paste(Sku, ".jpg", sep = ""))
dimension <- dim(painting)
painting_rgb <- data.frame(
x = rep(1:dimension[2], each = dimension[1]),
y = rep(dimension[1]:1, dimension[2]),
R = as.vector(painting[,, 1]), #slicing our array into three
G = as.vector(painting[,, 2]),
B = as.vector(painting[,, 3])
)
k_means = kmeans(painting_rgb[, c("R", "G", "B")], algorithm = "Lloyd", centers = 6, iter.max = 300)
test = (sapply(rgb(k_means$centers), color.id))
Color = lapply(test, `[[`, 1)
Values = k_means$size
Percentage = k_means$size / sum(k_means$size)
Final = do.call(rbind, Map(data.frame, Color = lapply(test, `[[`, 1), Values = k_means$size, ProductNumber = Sku, Percentage = Percentage))
#Final$i <- i # maybe you want to keep track of which iteration produced it?
#datalist[[i]] <- Final # add it to your list
#big_data = rbindlist(datalist)
#grid.table(big_data)
write.table(Final, file = "myDF.csv", sep = ",", col.names = TRUE, append = TRUE)
#R = Final[with(Final, order(-Percentage)),]
}, error = function(e) { closeAllConnections() })
closeAllConnections()
}

Adding different .csv files using loop and rbind in R (first time in R)

So I have a bunch of different .csv files that I'd like to merge into a single data frame so I can run the code table at the end. What I'm trying to do is so it checks the name of the file [V(x)] so that if x < 7, then it merges [V(2:7)] together. I've found similar questions, but I can't really figure out how to put the condition for the [V(x)]. Here's what I have so far, and I'm not getting the right number of observations.
#Import data from all versions of survey
setwd("~09 Export")
#Change the following when new versions are added
allVer <- c(2, 3, 4, 5, 6, 7)
curVer <- 7
oldVer <- c(2, 3, 4, 5, 6)
#loop
dta <- data.frame()
for (i in allVer){
dta <- rbind(dta, read.csv(paste("NH School Choice Survey [V", i, "]", ".csv", sep = "")), header = TRUE, stringsAsFactors = FALSE, fill = TRUE)
if (i < curVer){
dta <- rbind(dta, read.csv(paste("NH School Choice Survey [V", i, "]", ".csv", sep = "")), header = TRUE, stringsAsFactors = FALSE, fill = TRUE)
}
}
for (i in oldVer){
dta <- rbind(dta, read.csv(paste("NH School Choice Survey [V", i, "]", ".csv", sep = "")), header = TRUE, stringsAsFactors = FALSE, fill = TRUE)
}
dta$completed <- dta$anyStudent == 1
table(dta$idSurveyor, dta$completed)

R: Running row-wise operations between data frames

I'd like to run a statistical test, row-by-matching-row, between two data frames gex and mxy. The catch is that I need to run it several times, each time using a different column from gex, yielding a different vector of test results for each run.
Here is what I have so far (using example values), after much help from #kristang.
gex <- data.frame("sample" = c(987,7829,15056,15058,15072),
"TCGA-F4-6703-01" = runif(5, -1, 1),
"TCGA-DM-A28E-01" = runif(5, -1, 1),
"TCGA-AY-6197-01" = runif(5, -1, 1),
"TCGA-A6-5657-01" = runif(5, -1, 1))
colnames(gex) <- gsub("[.]", "_",colnames(gex))
listx <- c("TCGA_DM_A28E_01","TCGA_A6_5657_01")
mxy <- data.frame("TCGA-AD-6963-01" = runif(5, -1, 1),
"TCGA-AA-3663-11" = runif(5, -1, 1),
"TCGA-AD-6901-01" = runif(5, -1, 1),
"TCGA-AZ-2511-01" = runif(5, -1, 1),
"TCGA-A6-A567-01" = runif(5, -1, 1))
colnames(mxy) <- gsub("[.]", "_",colnames(mxy))
zScore <- function(x,y)((as.numeric(x) - as.numeric(rowMeans(y,na.rm=T)))/as.numeric(sd(y,na.rm=T)))
## BELOW IS FOR DIAGNOSTICS
write.table(mxy, file = "mxy.csv",
row.names=FALSE, col.names=TRUE, sep=",", quote=F)
write.table(gex, file = "gex.csv",
row.names=FALSE, col.names=TRUE, sep=",", quote=F)
## ABOVE IS FOR DIAGNOSTICS
for(i in seq(nrow(mxy)))
for(colName in listx){
zvalues <- zScore(gex[,colName[colName %in% names(gex)]],
mxy[i,])
## BELOW IS FOR DIAGNOSTICS
write.table(gex[,colName[colName %in% names(gex)]], file=paste0(colName, "column", ".csv"),
row.names=FALSE,col.names=FALSE,sep=",",quote=F)
write.table(mxy[i,], file=paste0(colName, "mxyinput", ".csv"),
row.names=FALSE,col.names=FALSE,sep=",",quote=F)
## ABOVE IS FOR DIAGNOSTICS
geneexptest <- data.frame(gex$sample, zvalues, row.names = NULL,
stringsAsFactors = FALSE)
write.csv(geneexptest, file = paste0(colName, ".csv"),
row.names=FALSE, col.names=FALSE, sep=",", quote=F)
}
The problem is that while it seems to go through and create the correct number of output files with the correct number of rows, etc...but it does not yield correct z-scores. I want it to calculate:
((Value from row z & given column of gex) - (Mean of values in row z across mxy)) / (Standard deviation of values in row z across mxy)
Then move on to the next row, and so on, filling in the first vector. THEN, I want it to calculate the same thing using the next column of gex, filling in a separate vector. I hope this makes sense.
I have a separate script which runs the same test using a pre-determined column vs the other data frame. The relevant for loop from that script looks like this:
for(i in seq_along(mxy)){
zvalues[i] <- (gex_column_W[i] - mean(mxy[i,])) / sd(mxy[i,])
}
I think there may be a typo in your code, specifically you say you want "Mean of values in row z across mxy" but are using the mean(mxy[,i])) which selects the i'th column, not the i'th row. I re-wrote this section with for loops for clarity. (not sure why you were using lapply?)
# a function fo calculationg the z score
zScore <- function(x,y)(x - mean(y,na.rm=T))/sd(y,na.rm=T)
for(i in seq(nrow(mxy))) # note that length(mxy) is actually the number of columns in mxy
for(colName in listx){
zvalues <- zScore(gex[,colName],# column == colName
mxy[i,])# row == i
geneexptest <- data.frame(gex$sample, zvalues, row.names = NULL,
stringsAsFactors = FALSE)
write.table(geneexptest, file = paste0(colName, "mxyinput", ".csv"),
row.names=FALSE, col.names=FALSE, quote=F,
sep = ",", dec = ".", append=(i > 1))
}
and alternative that does not rely on append:
for(colName in listx){
geneexptest <- NULL
for(i in seq(nrow(mxy))) {
zvalues <- zScore(gex[,colName],# column == colName
mxy[i,])# row == i
geneexptest <- rbind(geneexptest,
data.frame(gex$sample, zvalues, row.names = NULL,
stringsAsFactors = FALSE))
}
write.table(geneexptest, file = paste0(colName, "mxyinput", ".csv"),
row.names=FALSE, col.names=FALSE, quote=F,
sep = ",", dec = ".", append=(i > 1))
}

Resources