Loop over URLs in R: "too many open files" error

I have the following file with URLs in it. The idea is to download the image from each URL, get a 6-color palette, get the color names and percentages, and bind them all together in a list alongside the product number. But I get the "too many open files" error.
library(readxl)
library(jpeg)
library(scales)
library(plotrix)
library(gridExtra)
library(dplyr)
library(data.table)
dataset = read_excel("C:/Temp/Product.xlsx", sheet = "All")
datalist = list()
nRowsDf <- nrow(dataset)
avector <- as.vector(dataset$URL)
varenummer <- as.vector(dataset$Varenr)
for (i in 1:nRowsDf) {
tryCatch({
#Convert this from Data.frame to Vector
Sku <- as.vector(varenummer[[i]])
download.file(avector[[i]], paste(Sku,".jpg" ,sep = ""), mode = "wb")
painting <- readJPEG(paste(Sku,".jpg" ,sep = ""))
dimension <- dim(painting)
painting_rgb <- data.frame(
x = rep(1:dimension[2], each = dimension[1]),
y = rep(dimension[1]:1, dimension[2]),
R = as.vector(painting[,, 1]), #slicing array into RGB Channels
G = as.vector(painting[,, 2]),
B = as.vector(painting[,, 3])
)
k_means = kmeans(painting_rgb[, c("R", "G", "B")], algorithm = "Lloyd", centers = 6, iter.max = 300)
test = (sapply(rgb(k_means$centers), color.id))
Color = lapply(test, `[[`, 1)
Values = k_means$size
Percentage = k_means$size / sum(k_means$size)
Final = do.call(rbind, Map(data.frame, Color = lapply(test, `[[`, 1), Values = k_means$size, ProductNumber = Sku, Percentage = Percentage))
Final$i <- i # iteration
datalist[[i]] <- Final # add iteration to list
big_data = rbindlist(datalist)
#grid.table(big_data)
write.table(big_data, file = "myDF.csv", sep = ",", col.names = TRUE, append = TRUE)
#R = Final[with(Final, order(-Percentage)),]
}, error = function(e) { closeAllConnections() })
closeAllConnections()
}
The code stops after downloading around 266 unique JPEG images.
The code downloads only JPG files; if another file type is returned, it is simply ignored.
Error :
Error in file(file, ifelse(append, "a", "w")) :
cannot open the connection
In addition: Warning message:
In file(file, ifelse(append, "a", "w")) :
cannot open file 'myDF.csv': Too many open files
If I remove the tryCatch I get this:
Error in download.file(avector[[i]], "image.jpg", mode = "wb") :
cannot open destfile 'image.jpg', reason 'Too many open files'

The code had an error, or better said an unnecessary step, that kept connections open until it reached the limit imposed by "file".
Simply removing the iteration steps and the rbind of datalist makes it run flawlessly.
Below is the modified version.
for (i in 1:nRowsDf) {
tryCatch({
#Convert this from Data.frame to Vector
Sku <- as.vector(varenummer[[i]]) #for testing use 23406
download.file(avector[[i]], paste(Sku, ".jpg", sep = ""), mode = "wb")
painting <- readJPEG(paste(Sku, ".jpg", sep = "")) # readJPEG returns a height x width x channel array, which the slicing below expects
dimension <- dim(painting)
painting_rgb <- data.frame(
x = rep(1:dimension[2], each = dimension[1]),
y = rep(dimension[1]:1, dimension[2]),
R = as.vector(painting[,, 1]), #slicing our array into three
G = as.vector(painting[,, 2]),
B = as.vector(painting[,, 3])
)
k_means = kmeans(painting_rgb[, c("R", "G", "B")], algorithm = "Lloyd", centers = 6, iter.max = 300)
test = (sapply(rgb(k_means$centers), color.id))
Color = lapply(test, `[[`, 1)
Values = k_means$size
Percentage = k_means$size / sum(k_means$size)
Final = do.call(rbind, Map(data.frame, Color = lapply(test, `[[`, 1), Values = k_means$size, ProductNumber = Sku, Percentage = Percentage))
#Final$i <- i # maybe you want to keep track of which iteration produced it?
#datalist[[i]] <- Final # add it to your list
#big_data = rbindlist(datalist)
#grid.table(big_data)
write.table(Final, file = "myDF.csv", sep = ",", col.names = TRUE, append = TRUE)
#R = Final[with(Final, order(-Percentage)),]
}, error = function(e) { closeAllConnections() })
closeAllConnections()
}
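An alternative that keeps the per-iteration list is to move all the writing out of the loop: fill datalist inside the loop and open myDF.csv exactly once at the end. A minimal sketch of that pattern, reusing avector, varenummer and nRowsDf from above:
library(jpeg)
library(plotrix)    # color.id()
library(data.table) # rbindlist()
datalist <- vector("list", nRowsDf)
for (i in 1:nRowsDf) {
  tryCatch({
    Sku <- varenummer[[i]]
    jpg <- paste0(Sku, ".jpg")
    download.file(avector[[i]], jpg, mode = "wb")
    painting <- readJPEG(jpg)
    px <- data.frame(R = as.vector(painting[, , 1]),
                     G = as.vector(painting[, , 2]),
                     B = as.vector(painting[, , 3]))
    km <- kmeans(px, algorithm = "Lloyd", centers = 6, iter.max = 300)
    hits <- sapply(rgb(km$centers), color.id)
    datalist[[i]] <- data.frame(Color = sapply(hits, `[[`, 1),
                                Values = km$size,
                                ProductNumber = Sku,
                                Percentage = km$size / sum(km$size))
  }, error = function(e) NULL) # skip failed downloads/reads; nothing is left open
}
big_data <- rbindlist(datalist)
# one single open/write/close instead of one per iteration:
write.table(big_data, file = "myDF.csv", sep = ",", col.names = TRUE, row.names = FALSE)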

Related

How can I start this code found on GitHub?

I'm following this code on GitHub, and at line 51 I have a problem with options[i,] <- skew.raw: it says object "i" not found. Why? What should I put?
It also fails to take values: after running the get.options function I have NA values.
# Define function for formating/retrieving options data from json obj
get.options = function(symbols, date){
options = matrix(ncol = 11, nrow = length(symbols))
colnames(options) = c('Cl_price', "call_strike",
"call_lastPrice","call_vol","call_openInt", "call_ImpVoli",
"put_strike","put_lastPrice", 'put_vol',"put_openInt", 'put_ImpVoli')
rownames(options) = symbols
for(u in 1:length(symbols)){
s = symbols[u]
d = as.numeric(as.POSIXct(date, origin = '1970-01-01', tz = 'GMT'))
json_file <- sprintf('https://query2.finance.yahoo.com/v7/finance/options/%s?date=%d&formatted=true&crumb=UNus6VhY1bn&lang=en-US&region=US&corsDomain=finance.yahoo.com', s, d)
json_data <- suppressWarnings(fromJSON(paste(readLines(json_file), collapse = "")))
# CALLS
n = length(json_data$optionChain$result[[1]]$options[[1]]$calls)
if (n < 1) next
calls = matrix(ncol = 6, nrow = n)
for(i in 1:n) calls[,2][i] = json_data$optionChain$result[[1]]$options[[1]]$calls[[i]]$strike$raw
Cl.price = json_data$optionChain$result[[1]]$quote$regularMarketPrice
x <- which.min(abs((calls[,2]/Cl.price) -1))
calls = calls[x,]
calls[1] = Cl.price
calls[3] = json_data$optionChain$result[[1]]$options[[1]]$calls[[x]]$lastPrice$raw
calls[4] = json_data$optionChain$result[[1]]$options[[1]]$calls[[x]]$volume$raw
calls[5] = json_data$optionChain$result[[1]]$options[[1]]$calls[[x]]$openInterest$raw
calls[6] = json_data$optionChain$result[[1]]$options[[1]]$calls[[x]]$impliedVolatility$raw
# PUTS
n = length(json_data$optionChain$result[[1]]$options[[1]]$puts)
if(n < 1) next
puts = matrix(ncol = 5, nrow = n)
for(i in 1:n) puts[,1][i] = json_data$optionChain$result[[1]]$options[[1]]$puts[[i]]$strike$raw
x <- which.min(abs((puts[,1]/Cl.price) - 0.95))
puts = puts[x,]
puts[2] = json_data$optionChain$result[[1]]$options[[1]]$puts[[x]]$lastPrice$raw
puts[3] = json_data$optionChain$result[[1]]$options[[1]]$puts[[x]]$volume$raw
puts[4] = json_data$optionChain$result[[1]]$options[[1]]$puts[[x]]$openInterest$raw
puts[5] = json_data$optionChain$result[[1]]$options[[1]]$puts[[x]]$impliedVolatility$raw
options[u,] = c(calls, puts)
}
return(options)
}
# Define stocks and gather options data
date = '2017-04-21'
symbols <- c('DIS','CAT','TSLA')
daily.options = as.data.frame(get.options(symbols, date))
which(is.na(daily.options))
skew.raw = daily.options$put_ImpVoli - daily.options$call_ImpVoli # SKEW(i,t)
options[i,] <- skew.raw
write.table(options, 'DISCATTSLA', sep = ",")
options = read.table('DISCATTSLA', sep = ",")
I'm following this code because I read the paper by Rhui Zhao, but the paper does not discuss how to implement the volatility skew in software, so I was glad to find this code on GitHub.
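For what it's worth, the "object 'i' not found" error happens because i only exists inside the loop in get.options(); at the top level it was never defined (and options there is base R's options() function, not the matrix inside get.options). A minimal sketch of one way to keep the skew per symbol without the stray i (the names skew and skew.csv are my own):
# get.options() sets rownames(options) = symbols, and as.data.frame() keeps them
skew <- data.frame(symbol = rownames(daily.options),
                   skew = daily.options$put_ImpVoli - daily.options$call_ImpVoli)
write.table(skew, "skew.csv", sep = ",", row.names = FALSE)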

Combining multiple data sets in R

I am a complete beginner in R/programming. Right now I am trying to process hundreds of comma-separated data files using R. For time series analyses, I need to concatenate the data sets sequentially. Unfortunately, the data files do not have a designated time-stamp column and have some header lines. So I am parsing the file creation time from the second line of each data file and adding timesteps based on the sampling frequency, which can be found in the third line of the data file. Also, the sampling frequency varies from file to file and can be identified from the regex patterns in the filename. The first three header lines look like this:
SPU1 Monitor Data File
SPU Data Filename = 06Aug2021 ,07 -08 -28,s1c1h17.txt
Sample Frequency = 1
Or
SPU1 Traffic Data File
SPU Data Filename = 05Aug2021 ,02 -48 -14,s1c1p2311.txt
Sample Frequency = 20
I have tried a for loop as well as lapply. With the for loop, the script runs only once. With lapply, I get the following message. What am I doing wrong?
Error in file(file, "rt") : invalid 'description' argument
In addition: Warning messages:
1: In n.readLines(paste(filenames[i], sep = ","), header = FALSE, n = 1, :
file doesn't exist
2: In n.readLines(paste(filenames[i], sep = ",|\\s|-"), header = FALSE, :
file doesn't exist
Called from: file(file, "rt")
Here is the code I am trying:
setwd("C:/Users/rottweiller/Desktop/Practicing R")
filenames <- list.files(path="C:/Users/rottweiller/Desktop/Practicing R", pattern="c1h|c1p", full.names=FALSE)
library(reader)
library(readr)
library(tidyverse)
AddTS <- function(filenames){
#frq1 <- parse_number(n.readLines(paste(filenames[i], sep = ","), header = FALSE, n = 1, skip = 2))
frq1 <- as.integer(gsub("\\D", "", n.readLines(paste(filenames[i], sep = ","), header = FALSE, n = 1, skip = 2)))
TL1 <- n.readLines(paste(filenames[i], sep = ",|\\s|-"), header = FALSE, n = 1, skip = 1)
SUTC1 <- lubridate::parse_date_time(gsub("\\s-|\\s", "",
stringr::str_extract(TL1, "[SPU Data Filename = ]?\\d{2}\\D{3}\\d{4}\\s\\,\\d{2}\\s-\\d{2}\\s-\\d{2}")), orders = "dmYHMS")
C1 <- as.data.frame(read.delim(filenames[i], header = FALSE, sep = ",", skip = 79))
C1[] <- lapply(C1, function(j) if(is.numeric(j)) ifelse(is.infinite(j), 0, j) else j)
TS1 <- SUTC1 + (1/frq1)*seq_len(nrow(C1))
Card1 <- cbind(TS1, C1)
}
combined <- dplyr::bind_rows(lapply(filenames, AddTS))
Or
for(i in 1:length(filenames)){
frq1 <- parse_number(n.readLines(paste(filenames[i], sep = ","), header = FALSE, n = 1, skip = 2), trim_ws = TRUE)
TL1 <- n.readLines(paste(filenames[i], sep = ",|\\s|-"), header = FALSE, n = 1, skip = 1)
SUTC1 <- lubridate::parse_date_time(gsub("\\s-|\\s", "",
stringr::str_extract(TL1, "[SPU Data Filename = ]?\\d{2}\\D{3}\\d{4}\\s\\,\\d{2}\\s-\\d{2}\\s-\\d{2}")),
orders = "dmYHMS")
C1 <- as.data.frame(read.delim(filenames[i], header = FALSE, sep = ",", skip = 79))
C1[] <- lapply(C1, function(j) if(is.numeric(j)) ifelse(is.infinite(j), 0, j) else j)
TS1 <- SUTC1 + (1/frq1)*seq_len(nrow(C1))
Card1 <- cbind(TS1, C1)
}
It's a good start that you already know about regular expressions and recent R libraries. The immediate problem is that inside AddTS() the index i is never defined, so filenames[i] picks up whatever i happens to be left over in the global environment and usually yields NA, which is why file() complains about an invalid 'description'. Work with each filename directly instead.
You could do something like this:
purrr::map_dfr(filenames, function(f) {
lines <- readLines(f) # pass the path itself, not file(f): readLines() then opens and closes the connection for us
frq <- lines[3] %>%
str_replace(".*?(\\d*)$", "\\1") %>%
as.integer()
SUTC <- lines[2] %>%
stringr::str_extract("[SPU Data Filename = ]?\\d{2}\\D{3}\\d{4}\\s\\,\\d{2}\\s-\\d{2}\\s-\\d{2}") %>%
lubridate::parse_date_time(orders = "dmYHMS")
body <- lines[(which(lines == "end of text") + 2):length(lines)] # data begins two lines after the "end of text" marker
C <- read.delim(text = body, header = FALSE, sep = ",") %>%
mutate(across(where(is.numeric), ~ if_else(. == Inf, 0, .))) # replace Inf with 0, numeric columns only
TS <- SUTC + seq_len(nrow(C)) / frq
bind_cols(file = f, TS = TS, C)
})
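The question also mentions that the sampling frequency can be recovered from the filename; a minimal sketch, assuming from the two example headers that "c1h" (monitor) files run at 1 Hz and "c1p" (traffic) files at 20 Hz (those two rates and the helper name frq_from_name are my own assumptions):
frq_from_name <- function(f) {
  if (grepl("c1h", f)) return(1L)  # e.g. s1c1h17.txt: Sample Frequency = 1
  if (grepl("c1p", f)) return(20L) # e.g. s1c1p2311.txt: Sample Frequency = 20
  NA_integer_
}
frq_from_name("s1c1h17.txt") # 1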

How do you store the result of each iteration of a for loop in its own matrix?

I have a for loop that takes each sample file on a list, creates a matrix for that sample, and then stores it into one big list of all the sample matrices.
Here is what I have done so far:
# load in data ------------------------------------------------------------------
filePaths = getGEOSuppFiles("GSE124395")
tarF <- list.files(path = "./GSE124395/", pattern = "*.tar", full.names = TRUE)
untar(tarF, exdir = "./GSE124395/")
gzipF <- list.files(path = "./GSE124395/", pattern = "*.gz", full.names = TRUE)
ldply(.data = gzipF, .fun = gunzip)
#running test loop -------------------------------------------------------------
testlist <- c("./GSE124395//GSM3531672_P301_3_CRYOMIXED11.coutt.csv",
"./GSE124395//GSM3531673_P301_4_CRYOMIXED12.coutt.csv",
"./GSE124395//GSM3531674_P301_5_HEP1_1_5.coutt.csv")
LoopList_test <- list()
for (i in 1:length(testlist)){
matrix_test <- read.delim(file =testlist[i])
matrix_test <- data.frame(matrix_test[,-1], row.names=matrix_test[,1])
matrix_test <- as.matrix(matrix_test) #<- makes the excel file into a matrix
colname_test <- read.delim(file =testlist[i])
colname_test <- read.table(file = './GSE124395//GSE124395_celseq_barcodes.192.txt', header = FALSE, row.names = 1)
colname_test <- data.frame(colname_test[,-1], col=colname_test[,1])
colname_test <- as.matrix(colname_test)
colnames(matrix_test) <- colname_test[,1]
LoopList_test[[i]]<-matrix_test
}
This is the output:
(screenshot: part of the output, showing the one big list)
I would like the loop to store the result of each iteration in its own matrix, so that I have multiple matrices instead of one giant list of matrices, if that makes sense. I think it involves either splitting this one giant list into sublists, storing the results of the loop in a matrix/array/vector instead of a list, or somehow having the loop store each iteration in its own variable. I'm not sure how to go about doing any of those.
Thanks for reading!
UPDATE:
So the whole point of this was to create matrices to then combine them into one matrix. Then turn this one matrix into a Seurat object which I could then perform clustering on.
So here is what I have done so far: essentially, I made a separate loop for each group within the dataset, added whatever information I needed, and kept everything in a list; the function I think I need actually takes a list, so that's good for me. Here's the code I decided on at the moment:
mylist<-list.files(path = "./GSE124395/", pattern = "\\.csv$",full.names = TRUE)
LoopList <- list()
for (i in 1:30){
matrix_input <- read.delim(file =mylist[i])
matrix_input <- data.frame(matrix_input[,-1], row.names=matrix_input[,1])
matrix_input <- as.matrix(matrix_input) #<- makes the excel file into a matrix
colname_input <- read.delim(file =mylist[i])
colname_input <- read.table(file = './GSE124395//GSE124395_celseq_barcodes.192.txt', header = FALSE, row.names = 1)
colname_input <- data.frame(colname_input[,-1], col=colname_input[,1])
colname_input <- as.matrix(colname_input)
colnames(matrix_input) <- colname_input[,1]
colnames(matrix_input) <- paste(colnames(matrix_input), "Colorectal_Metastasis", sep = "_")
P301_pdat <- data.frame("samples" = colnames(matrix_input), "treatment" = "Colorectal_Metastasis")
sobj <- CreateSeuratObject(counts = matrix_input, min.cells = 0, min.features = 1,
project = "Patient301_Colorectal_Metastasis")
LoopList[[i]]<-sobj
#LoopList <- assign(paste0("Patient301", i), sobj )
}
# P304 loop -------------------------------------------------------------------------
for (i in 31:56){
matrix_input <- read.delim(file =mylist[i])
matrix_input <- data.frame(matrix_input[,-1], row.names=matrix_input[,1])
matrix_input <- as.matrix(matrix_input) #<- makes the excel file into a matrix
colname_input <- read.delim(file =mylist[i])
colname_input <- read.table(file = './GSE124395//GSE124395_celseq_barcodes.192.txt', header = FALSE, row.names = 1)
colname_input <- data.frame(colname_input[,-1], col=colname_input[,1])
colname_input <- as.matrix(colname_input)
colnames(matrix_input) <- colname_input[,1]
colnames(matrix_input) <- paste(colnames(matrix_input), "Colorectal_Metastasis", sep = "_")
P304_pdat <- data.frame("samples" = colnames(matrix_input), "treatment" = "Colorectal_Metastasis")
sobj <- CreateSeuratObject(counts = matrix_input, min.cells = 0, min.features = 1,
project = "Patient304_Colorectal_Metastasis")
LoopList[[i]]<-sobj
}
and so on. Then, following https://satijalab.org/seurat/articles/integration_large_datasets.html
sobj.list <- SplitObject(LoopList, split.by = "orig.ident") # note: SplitObject() expects a single merged Seurat object, not a list; sobj.list is unused below, since the lapply works on LoopList directly
joined <- lapply(X = LoopList, FUN = function(x) {
x <- NormalizeData(x, verbose = FALSE)
x <- FindVariableFeatures(x, verbose = FALSE)
})
features <- SelectIntegrationFeatures(object.list = joined)
joined <- lapply(X = joined, FUN = function(x) {
x <- ScaleData(x, features = features, verbose = FALSE)
x <- RunPCA(x, features = features, verbose = FALSE)
})
anchors <- FindIntegrationAnchors(object.list = joined, reduction = "rpca",
dims = 1:50)
joined.integrated <- IntegrateData(anchorset = anchors, dims = 1:50)
joined.integrated <- ScaleData(joined.integrated, verbose = FALSE)
joined.integrated <- RunPCA(joined.integrated, verbose = FALSE)
joined.integrated <- RunUMAP(joined.integrated, dims = 1:50)
DimPlot(joined.integrated, group.by = "orig.ident")
DimPlot(joined.integrated, reduction = "umap", split.by = "treatment")
I don't know for sure if this works, but I thought I would update this question to reflect what I've learned so far! I guess the lesson is: see if you can find a function that takes a list as input, heh.
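As for the original ask (one standalone matrix per iteration instead of one big list), a minimal sketch, reusing LoopList_test from the first loop: name the list elements and push them into the global environment with list2env().
names(LoopList_test) <- paste0("matrix_", seq_along(LoopList_test))
list2env(LoopList_test, envir = .GlobalEnv)
# matrix_1, matrix_2, matrix_3 now exist as separate matrices
That said, keeping everything in one named list is usually easier to work with, as the integration code above shows.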

snapPointsToLines can't keep attributes in R

I recently found a problem with snapPointsToLines: it can't keep the attributes of the spatial points data frame. An example is below:
# Generate a spatial line dataframe
l1 = cbind(c(1,2,3),c(3,2,2))
l1a = cbind(l1[,1]+.05,l1[,2]+.05)
l2 = cbind(c(1,2,3),c(1,1.5,1))
Sl1 = Line(l1)
Sl1a = Line(l1a)
Sl2 = Line(l2)
S1 = Lines(list(Sl1, Sl1a), ID="a")
S2 = Lines(list(Sl2), ID="b")
Sl = SpatialLines(list(S1,S2))
df = data.frame(z = c(1,2), row.names=sapply(slot(Sl, "lines"), function(x) slot(x, "ID")))
Sldf = SpatialLinesDataFrame(Sl, data = df)
# Generate a spatial point dataframe
xc = c(1.2,1.5,2.5)
yc = c(1.5,2.2,1.6)
Spoints = SpatialPoints(cbind(xc, yc))
Spdf <- SpatialPointsDataFrame(Spoints, data = data.frame(value = 1:length(Spoints)))
#use the function snapPointsToLines
res <- snapPointsToLines(Spdf, Sldf)
res only has "nearest_line_id" and "snap_dist". It doesn't have the "value" field from Spdf, which I need.
#use the function snapPointsToLines with the "withAttrs = TRUE" parameter
res <- snapPointsToLines(Spdf, Sldf, withAttrs = TRUE)
It reports an error:
"Error in snapPointsToLines(Spdf, Sldf, withAttrs = TRUE) :
A SpatialPoints object has no attributes! Please set withAttrs as FALSE."
But Spdf is a SpatialPointsDataFrame with attributes.
I don't know what the problem is. When I used this function several weeks ago, it didn't have this problem.
I think the problem may be in the function itself. If you look at the code of the function, you can see this at the beginning:
if (class(points) == "SpatialPoints" && missing(withAttrs))
withAttrs = FALSE
if (class(points) == "SpatialPoints" && withAttrs == TRUE)
stop("A SpatialPoints object has no attributes! Please set withAttrs as FALSE.")
Sometimes a SpatialPointsDataFrame can be identified as SpatialPoints, so the function treats your SpatialPointsDataFrame as SpatialPoints and drops the attributes.
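A quick illustration of the difference, reusing Spdf from above: class() looks only at the most specific class, while is() also matches inherited classes.
class(Spdf)               # "SpatialPointsDataFrame"
is(Spdf, "SpatialPoints") # TRUE: it inherits from SpatialPoints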
You can make a small modification to the function's code, as below.
snapPointsToLines1 <- function (points, lines, maxDist = NA, withAttrs = TRUE, idField = NA)
{
if (rgeosStatus()) {
if (!requireNamespace("rgeos", quietly = TRUE))
stop("package rgeos required for snapPointsToLines")
}
else stop("rgeos not installed")
if (is(points, "SpatialPointsDataFrame")==FALSE && missing(withAttrs))
withAttrs = FALSE
if (is(points, "SpatialPointsDataFrame")==FALSE && withAttrs == TRUE)
stop("A SpatialPointsDataFrame object is needed! Please set withAttrs as FALSE.")
d = rgeos::gDistance(points, lines, byid = TRUE)
if (!is.na(maxDist)) {
distToLine <- apply(d, 2, min, na.rm = TRUE)
validPoints <- distToLine <= maxDist
distToPoint <- apply(d, 1, min, na.rm = TRUE)
validLines <- distToPoint <= maxDist
points <- points[validPoints, ]
lines = lines[validLines, ]
d = d[validLines, validPoints, drop = FALSE]
distToLine <- distToLine[validPoints]
if (!any(validPoints)) {
if (is.na(idField)) {
idCol = character(0)
}
else {
idCol = lines@data[, idField][0]
}
newCols = data.frame(nearest_line_id = idCol, snap_dist = numeric(0))
if (withAttrs)
df <- cbind(points@data, newCols)
else df <- newCols
res <- SpatialPointsDataFrame(points, data = df,
proj4string = CRS(proj4string(points)), match.ID = FALSE)
return(res)
}
}
else {
distToLine = apply(d, 2, min, na.rm = TRUE)
}
nearest_line_index = apply(d, 2, which.min)
coordsLines = coordinates(lines)
coordsPoints = coordinates(points)
mNewCoords = vapply(1:length(points), function(x) nearestPointOnLine(coordsLines[[nearest_line_index[x]]][[1]],
coordsPoints[x, ]), FUN.VALUE = c(0, 0))
if (!is.na(idField)) {
nearest_line_id = lines@data[, idField][nearest_line_index]
}
else {
nearest_line_id = sapply(slot(lines, "lines"),
function(i) slot(i, "ID"))[nearest_line_index]
}
if (withAttrs)
df = cbind(points@data, data.frame(nearest_line_id, snap_dist = distToLine))
else df = data.frame(nearest_line_id, snap_dist = distToLine,
row.names = names(nearest_line_index))
SpatialPointsDataFrame(coords = t(mNewCoords), data = df,
proj4string = CRS(proj4string(points)))
}
Then, using this new function snapPointsToLines1, you get the attributes you want.
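A quick check, reusing Spdf and Sldf from the example above:
res <- snapPointsToLines1(Spdf, Sldf)
res@data # now has the original "value" column plus nearest_line_id and snap_dist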

How to fix kmeans error in R: 'more cluster centers than distinct data points'

When I run a kmeans algorithm I receive this error:
Error in kmeans(x, 2, 15) :
more cluster centers than distinct data points.
How can this error be fixed, and what does it mean? I think my data points are distinct.
Here are my files and the R code I am using to run kmeans:
rnames.csv :
"a1","a2","a3"
cells.csv :
0,1,2,1,4,3,5,3,4
cnames.csv :
"google","so","test"
cells = c(read.csv("c:\\data-files\\kmeans\\cells.csv", header = TRUE))
rnames = c(read.csv("c:\\data-files\\kmeans\\rnames.csv", header = TRUE))
cnames = c(read.csv("c:\\data-files\\kmeans\\cnames.csv", header = TRUE))
x <- matrix(cells, nrow=3, ncol=3, byrow=TRUE, dimnames=list(rnames, cnames))
# run K-Means
km <- kmeans(x, 2, 15)
The fix is to read the files with header = FALSE. Each of these one-line files contains data, not column names, so with header = TRUE the only row is consumed as the header and kmeans is left with fewer distinct data points than the two requested centers. Use:
cells = c(read.csv("c:\\data-files\\kmeans\\cells.csv", header = FALSE))
rnames = c(read.csv("c:\\data-files\\kmeans\\rnames.csv", header = FALSE))
cnames = c(read.csv("c:\\data-files\\kmeans\\cnames.csv", header = FALSE))
instead of
cells = c(read.csv("c:\\data-files\\kmeans\\cells.csv", header = TRUE))
rnames = c(read.csv("c:\\data-files\\kmeans\\rnames.csv", header = TRUE))
cnames = c(read.csv("c:\\data-files\\kmeans\\cnames.csv", header = TRUE))
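A quick sanity check before clustering, assuming the same file paths as above (using unlist() rather than c() so the matrix is numeric rather than a list): count the distinct rows, which must be at least the number of centers.
cells <- unlist(read.csv("c:\\data-files\\kmeans\\cells.csv", header = FALSE))
x <- matrix(cells, nrow = 3, ncol = 3, byrow = TRUE)
nrow(unique(x)) # 3 distinct rows here, so kmeans(x, 2, 15) has enough points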
