R: number of observations limited to 1500 when pulling Binance data

I've been stuck on this for days. I want to pull data from the Binance API, easily more than ten thousand observations, but each request comes back capped at 1500 rows (the cap is applied per request on the API side; it is not an R limit).
I have been advised to use a loop, but it hasn't helped so far.
Any help would be greatly appreciated!
library(httr)
library(jsonlite)
library(lubridate)
library(data.table)  # for rbindlist()
library(binancer)    # for binance_klines()
# API docs:
# https://github.com/binance-exchange/binance-official-api-docs/blob/master/rest-api.md#klinecandlestick-data
options(stringsAsFactors = FALSE)
url <- "https://api.binance.com"
path <- "/api/v3/exchangeInfo"
raw.result <- GET(url = url, path = path)
not.cool.data <- rawToChar(raw.result$content)
list1 <- fromJSON(not.cool.data)
symbol.list <- list1$symbols$symbol  # avoid calling this 'list'; that would mask base::list()
klines2 <- rbindlist(lapply(
  c('LTCTUSD', 'LTCBNB'),
  binance_klines,
  interval = '30m',
  start_time = '2017-01-01',
  end_time = '2021-01-08'
))
names(klines2)
sapply(klines2, function(x) length(unique(x)))
klines2
df.1 <- list.files(pattern = "2017-2021")
df.1_r <- vector(mode = "integer",  # mode must be a character string
                 length = length(klines2))
tickling <- unique(klines2$symbol)
tickling
low <- c()
high <- c()
for (symbol in tickling) {
  look.at <- klines2$symbol == symbol
  low <- append(low, min(klines2$low[look.at]))    # index into klines2, not the symbol string
  high <- append(high, max(klines2$high[look.at]))
}
tickling
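For reference, the loop usually suggested for this is a paging loop: the klines endpoint returns a bounded number of rows per request (on the order of 1000 to 1500), so you request successive time windows and bind the pieces together. Below is a minimal sketch of that idea, assuming the binancer package (whose binance_klines() accepts start_time/end_time); the 20-day chunk size is an assumption, sized so that 30-minute bars stay under the per-request cap.
library(binancer)
library(data.table)
fetch_klines_paged <- function(symbol, interval = '30m',
                               start = as.POSIXct('2017-01-01', tz = 'UTC'),
                               end   = as.POSIXct('2021-01-08', tz = 'UTC'),
                               chunk = as.difftime(20, units = 'days')) {
  pieces <- list()
  from <- start
  while (from < end) {
    to <- min(from + chunk, end)
    # each request covers one window, so it stays under the per-request row cap
    pieces[[length(pieces) + 1]] <- binance_klines(symbol, interval = interval,
                                                   start_time = from, end_time = to)
    from <- to
  }
  rbindlist(pieces)
}
klines2 <- rbindlist(lapply(c('LTCTUSD', 'LTCBNB'), fetch_klines_paged))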

Related

R 'differing number of rows' error using OpenWeather API

For an assignment I'm trying to fetch weather forecast data from the OpenWeather API. My code returns this error:
Error in data.frame(city = city, weather = weather, visibility = visibility, :
arguments imply differing number of rows: 40, 1, 0, 12
I re-ran it under RStudio's debugger, but that didn't make the cause any clearer. Can anyone help me find the problem in my code? Very grateful for any help.
library(httr)
# To create empty vectors to hold data temporarily
city <- c()
weather <- c()
visibility <- c()
temp <- c()
temp_min <- c()
temp_max <- c()
pressure <- c()
humidity <- c()
wind_speed <- c()
wind_deg <- c()
forecast_datetime <- c()
season <- c()
# To get 5-day forecast data for select cities
get_weather_forecast_by_cities <- function(city_names){
  df <- data.frame()
  for (city_name in city_names){
    url_get <- 'https://api.openweathermap.org/data/2.5/forecast'
    weather_api_key <- "[*my API key*]"
    forecast_query <- list(q = city_name, appid = weather_api_key, units = "metric")
    response <- GET(url_get, query = forecast_query)
    json_list <- content(response, as = "parsed")
    results <- json_list$list
    for (result in results) {
      # Get weather data and append them to vectors
      city <- c(city, city_name)
    }
    # Combine into a DF
    weather <- c(weather, result$weather[[1]]$main)
    visibility <- c(visibility, result$visibility)
    temp <- c(temp, result$main$temp)
    temp_min <- c(temp_min, result$main$temp_min)
    temp_max <- c(temp_max, result$main$temp_max)
    pressure <- c(pressure, result$main$pressure)
    humidity <- c(humidity, result$main$humidity)
    wind_speed <- c(wind_speed, result$wind$speed)
    wind_deg <- c(wind_deg, result$wind$deg)
    forecast_datetime <- c(forecast_datetime, result$dt_text)
    months <- as.numeric(format(as.Date(forecast_datetime), '%m'))
    index <- setNames(rep(c('winter', 'spring', 'summer', 'fall'), each = 3), c(12, 1:11))
    season <- unname(index[as.character(months)])
    weather_df <- data.frame(city = city, weather = weather, visibility = visibility, temp = temp, temp_min = temp_min, temp_max = temp_max, pressure = pressure, humidity = humidity, wind_deg = wind_deg, forecast_datetime = forecast_datetime, months = months, index = index, season = season)
  }
  return(df)
}
cities <- c("Seoul", "New York City", "Paris", "London", "Taiyuan")
cities_weather_df <- get_weather_forecast_by_cities(cities)
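The row counts in the error line up with the loop structure: only city <- c(city, city_name) sits inside the inner for (result in results) loop (hence length 40), the other appends run just once per city after that loop closes (hence length 1), result$visibility can be absent (length 0), and index is a fixed length-12 lookup table that should not go into the data frame; the function also returns the never-filled df. A hedged rewrite along those lines; weather_api_key is assumed to be defined beforehand, and note the forecast timestamp field is dt_txt, not dt_text:
library(httr)
get_weather_forecast_by_cities <- function(city_names) {
  rows <- list()
  for (city_name in city_names) {
    response <- GET("https://api.openweathermap.org/data/2.5/forecast",
                    query = list(q = city_name, appid = weather_api_key, units = "metric"))
    json_list <- content(response, as = "parsed")
    for (result in json_list$list) {  # every append lives INSIDE the inner loop
      rows[[length(rows) + 1]] <- data.frame(
        city = city_name,
        weather = result$weather[[1]]$main,
        visibility = if (is.null(result$visibility)) NA else result$visibility,
        temp = result$main$temp,
        temp_min = result$main$temp_min,
        temp_max = result$main$temp_max,
        pressure = result$main$pressure,
        humidity = result$main$humidity,
        wind_speed = result$wind$speed,
        wind_deg = result$wind$deg,
        forecast_datetime = result$dt_txt)  # API field is dt_txt
    }
  }
  df <- do.call(rbind, rows)
  # derive the season from the month, keeping the lookup table out of the data frame
  months <- as.numeric(format(as.Date(df$forecast_datetime), '%m'))
  season_lookup <- setNames(rep(c('winter', 'spring', 'summer', 'fall'), each = 3),
                            c(12, 1:11))
  df$season <- unname(season_lookup[as.character(months)])
  df
}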

Arabidopsis Gene ID Conversion (BioMart, CLC Genomics Workbench Output)

I have an output of RNA-seq reads from CLC genomics workbench, for Arabidopsis thaliana. The list of genes contains a mix of gene names (i.e. "TRY", "TMM", "SVP", "FLC"), and IDs (e.g. "AT1G01390", "AT1G01310", "AT1G01240"). I would like to convert them all to gene names, so I can run it through a GO terms R package (the package seemingly does not read IDs like AT1G01390).
When I use biomaRt's getBM() function, it returns far fewer genes than the list I feed into it. The original list from CLC has all Arabidopsis genes (27,655), while the output from getBM() generally has 12,085 gene names or fewer.
Anybody done this type of conversion before with success?
Thanks in advance!
I've tried various types of attributes, but none of them have worked.
#data load in and conversions, meta matrix/design creation:
#reads file was created in CLC Genomics Workbench, then the reads column copied and pasted
#for each sample
library(biomaRt)
reads <- as.matrix(read.csv("genereads_ONLY4.txt", sep = '\t', row.names = 1, header = TRUE))
meta <- read.table("metatest4.txt", header = TRUE, fileEncoding = "UTF-16LE")
mart <- useMart(biomart = "plants_mart", host = "plants.ensembl.org")
listDatasets(useMart(biomart = "plants_mart", host = "plants.ensembl.org"))
ensembl <- useDataset("athaliana_eg_gene", mart = mart)
genes <- row.names(reads)
test1 <- getBM(attributes = 'external_gene_name',
               values = genes,
               mart = ensembl)
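One likely reason getBM() comes back short is the call above: values = genes is passed without a filters = argument, so the gene list never actually constrains the query, and asking only for external_gene_name drops the mapping back to your input IDs. A hedged sketch of a corrected call (the 'tair_locus' filter name is an assumption for the plants mart; verify it with listFilters()):
library(biomaRt)
head(listFilters(ensembl))     # discover the valid filter names
head(listAttributes(ensembl))  # discover the valid attribute names
mapping <- getBM(attributes = c('tair_locus', 'external_gene_name'),
                 filters = 'tair_locus',   # assumed filter name; check listFilters()
                 values = genes,
                 mart = ensembl)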
Okay, I found a roundabout way to solve this, at least for my scenario.
The gmt and fgsea information I'm using can only read gene symbols (e.g. "TRY") or Entrez IDs, so I wrote a function to convert all of the information I had to either symbols or Entrez IDs. The code is:
library(annotate)        # getSYMBOL(), getEG()
library(org.At.tair.db)  # Arabidopsis TAIR annotation maps
reads <- as.matrix(read.csv("genereads_ONLY4.txt", sep = '\t', row.names = 1, header = TRUE))
genes <- row.names(reads)
# count how many entries still look like TAIR locus IDs (ATxGxxxxx)
sum(lengths(regmatches(genes, gregexpr("\\AT[0-9]", genes, ignore.case = TRUE))))
#genes <- c("TRY", "AT2G46410", "AT5G41315", "AT2G42200", "AT1G10280")
IDconvert <- function(genes) {
  for (i in genes) {
    if (grepl("AT[0-9]", i) == TRUE) {
      if (is.na(getSYMBOL(i, data = 'org.At.tair.db')) == TRUE) {
        if (is.na(getEG(i, data = 'org.At.tair')) == TRUE) {
          i <- i  # neither a symbol nor an Entrez ID found: keep the TAIR ID
        } else {
          name <- getEG(i, data = 'org.At.tair')
          name.l <- as.list(name)
          newname <- as.character(name.l[[1]])
          genes <- sub(i, newname, genes)
        }
      } else {
        name <- getSYMBOL(i, data = 'org.At.tair')
        name.l <- as.list(name)
        newname <- as.character(name.l[[1]])
        genes <- sub(i, newname, genes)
      }
    } else {
      NULL
    }
  }
  return(genes)
}
genes2 <- IDconvert(genes)
sum(lengths(regmatches(genes2, gregexpr("\\AT[0-9]", genes2, ignore.case = TRUE))))
row.names(reads) <- genes2
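# Aside: the same ID-to-symbol conversion can be done without the per-gene
# loop via AnnotationDbi::mapIds(). A sketch; keytype "TAIR" is the usual
# key for org.At.tair.db, verify with keytypes(org.At.tair.db):
library(AnnotationDbi)
is_id <- grepl("^AT[0-9CM]G", genes)  # TAIR locus IDs such as AT1G01390
syms <- mapIds(org.At.tair.db, keys = genes[is_id],
               column = "SYMBOL", keytype = "TAIR")
genes2 <- genes
genes2[is_id] <- ifelse(is.na(syms), genes[is_id], syms)  # keep the ID where no symbol exists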
library(fgsea)                         # fgsea()
gmt <- read.gmt("GSEA_BIO.gmt")        # read.gmt() from e.g. the qusage package
gmt.ids <- read.gmt("IB_BIO_GMT.gmt")
gmt.combo <- c(gmt, gmt.ids)
#Stage 3 GO terms (sub.break1..4 are per-stage DE result tables created upstream, not shown)
names3 <- row.names(sub.break3)
sub.break3$names <- names3
ranks <- sub.break3$stat
names(ranks) <- sub.break3$names
sub.break3.rank <- sort(ranks, decreasing = T)
fgseaRes3 <- fgsea(pathways = gmt.combo,
                   stats = sub.break3.rank,
                   minSize = 5,
                   maxSize = 500,
                   nperm = 100000)
fgsea3.sig <- fgseaRes3[pval < 0.05]
pathways.stg3 <- fgsea3.sig$pathway
#Stage 1 GO terms
names1 <- row.names(sub.break1)
sub.break1$names <- names1
ranks <- sub.break1$stat
names(ranks) <- sub.break1$names
sub.break1.rank <- sort(ranks, decreasing = T)
fgseaRes1 <- fgsea(pathways = gmt.combo,
                   stats = sub.break1.rank,
                   minSize = 5,
                   maxSize = 500,
                   nperm = 100000)
fgsea1.sig <- fgseaRes1[pval < 0.05]
pathways.stg1 <- fgsea1.sig$pathway
#Stage 2 GO terms
names2 <- row.names(sub.break2)
sub.break2$names <- names2
ranks <- sub.break2$stat
names(ranks) <- sub.break2$names
sub.break2.rank <- sort(ranks, decreasing = T)
fgseaRes2 <- fgsea(pathways = gmt.combo,
                   stats = sub.break2.rank,
                   minSize = 5,
                   maxSize = 500,
                   nperm = 100000)
fgsea2.sig <- fgseaRes2[pval < 0.05]
pathways.stg2 <- fgsea2.sig$pathway
#Stage 4 GO terms
names4 <- row.names(sub.break4)
sub.break4$names <- names4
ranks <- sub.break4$stat
names(ranks) <- sub.break4$names
sub.break4.rank <- sort(ranks, decreasing = T)
fgseaRes4 <- fgsea(pathways = gmt.combo,
                   stats = sub.break4.rank,
                   minSize = 5,
                   maxSize = 500,
                   nperm = 100000)
fgsea4.sig <- fgseaRes4[pval < 0.05]
pathways.stg4 <- fgsea4.sig$pathway
#openxlsx::write.xlsx(fgsea4.sig, "fgsea_stg4_t1.xlsx")
#GO Venn-----------------------------------
group.venn(list(One = pathways.stg1,   # group.venn() from e.g. the RAM package
                Two = pathways.stg2,
                Three = pathways.stg3,
                Four = pathways.stg4),
           fill = c("orange", "green", "red", "blue"))

How to optimise my code to run on a Windows 2012 Server

My code runs very slowly on my laptop, and I have access to a Windows 2012 Server x64 with 256 GB of RAM.
I have the server set up running R and this code working, but 48 hours of runtime only got it 25% of the way through.
From what I have learnt, that is because it only uses one core.
I'm currently exploring a foreach loop, but getting nowhere slowly.
library("sp")
library("rgeos")
library("geosphere")
library("gdistance")
# Data
dna <- data.frame(cbind(rnorm(400) * 2 + 13, rnorm(400) + 48))
dna$ID <- seq.int(nrow(dna))
match <- data.frame(cbind(rnorm(4000) * 2 + 13, rnorm(4000) + 48))
match$ID <- seq.int(nrow(match))
##Set row id
RID2 <- 1
#create output table
tablelength <- print(nrow(dna))
match1 <- data.frame(UPRN = rep(0, tablelength), Long = rep(0, tablelength), Lats = rep(0, tablelength),
                     MatchID = rep(0, tablelength), Longm = rep(0, tablelength), Latsm = rep(0, tablelength),
                     distance = rep(0, tablelength))
#start loop
for (RID2 in dna[,3]) {
  #Set UPRN and Exchange Name
  Name <- paste(dna[RID2,3])
  set1 <- data.frame(dna[RID2,1:2])
  set2 <- data.frame(match[,1:2])
  set1sp <- SpatialPoints(set2)
  set2sp <- SpatialPoints(set1)
  set1$ID <- apply(gDistance(set1sp, set2sp, byid=TRUE), 1, which.min)
  ID <- paste(apply(gDistance(set1sp, set2sp, byid=TRUE), 1, which.min))
  #insert row
  match1[RID2, ] <- c(Name, set1[,1], set1[,2], paste(match[ID,3]), set2[ID,1], set2[ID,2],
                      distVincentyEllipsoid(c(set1[,1], set1[,2]), c(set2[ID,1], set2[ID,2]),
                                            a=6378137, b=6356752.3142, f=1/298.257223563))
  remove(set1, set2, set1sp, set2sp)
}
The output is what I am looking for, but ideally with a sub-one-day runtime (it is currently about 8 days).
This works for me, and cuts calculation time (on your sample data) in half on my machine:
set.seed(123)
# Data
dna <- data.frame(cbind(rnorm(400) * 2 + 13, rnorm(400) + 48))
dna$ID <- seq.int(nrow(dna))
match <- data.frame(cbind(rnorm(4000) * 2 + 13, rnorm(4000) + 48))
match$ID <- seq.int(nrow(match))
###
library(sf)
library(data.table)
dna.sf <- st_as_sf(x = dna,
                   coords = c("X1", "X2"),
                   crs = "+proj=longlat +datum=WGS84")
match.sf <- st_as_sf(x = match,
                     coords = c("X1", "X2"),
                     crs = "+proj=longlat +datum=WGS84")
#create data.tables
setDT(dna)
setDT(match)
#add suffixes to identify columns later (after join)
setnames(dna, names(dna), paste0(names(dna), ".dna"))
setnames(match, names(match), paste0(names(match), ".match"))
#create distance matrix (in metres, since the CRS is geographic)
m <- round(st_distance(dna.sf, match.sf), digits = 0)
colnames(m) <- match.sf$ID
rownames(m) <- dna.sf$ID
#get colname of min to nearest (remember, colname = match-ID ;-) )
dna$nearest <- apply(m, 1, which.min)
#get the min distance
dna$dist <- apply(m, 1, min)
#now left-join to get the coordinates of match, using data.table for speed
result <- match[dna, on = c("ID.match==nearest")]
The results seem to be the same as when using your 'old' method, but calculation time is roughly cut in half (7.5 -> 4 secs)
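As a further, hedged refinement: sf versions from roughly 0.7 onward ship st_nearest_feature(), which skips building the full 400 x 4000 distance matrix altogether. A sketch, not benchmarked here:
library(sf)
#index of the nearest match.sf row for each dna.sf point
nearest <- st_nearest_feature(dna.sf, match.sf)
#element-wise distances to just those nearest points
dist <- st_distance(dna.sf, match.sf[nearest, ], by_element = TRUE)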
You can already get a good speed boost by simply optimizing the code and removing redundant parts. For example, this is more or less twice as fast on the test data, and is easily parallelizable.
library("sp")
library("rgeos")
library("geosphere")
library("gdistance")
# Data
dna <- data.frame(cbind(rnorm(400) * 2 + 13, rnorm(400) + 48))
dna$ID <- seq.int(nrow(dna))
match <- data.frame(cbind(rnorm(40000) * 2 + 13, rnorm(40000) + 48))
match$ID <- seq.int(nrow(match))
##Set row id
RID2 <- 1
#create output table
tablelength <- nrow(dna)
matchlist <- list()
set2 <- match[,1:2]
set1sp <- SpatialPoints(set2)  # build the big point set once, outside the loop
for (RID2 in dna[,3]) {
  set1 <- dna[RID2,1:2]
  set2sp <- SpatialPoints(set1)
  ID <- which.min(gDistance(set1sp, set2sp, byid=TRUE))
  #insert row
  matchlist[[RID2]] <- data.frame(UPRN = dna[RID2,3],
                                  Long = set1[,1],
                                  Lats = set1[,2],
                                  matchid = match[ID,3],
                                  Longm = set2[ID,1],
                                  Latsm = set2[ID,2],
                                  distance = distVincentyEllipsoid(set1, set2[ID,],
                                                                   a=6378137, b=6356752.3142,
                                                                   f=1/298.257223563))
}
match1 <- data.table::rbindlist(matchlist)
Thank you all for your input; I will be reading through the different approaches to further my R learning.
I ended up using a solution posted in the Reddit thread I made at the same time:
require(foreach)
require(doParallel)
cl <- makeCluster(4)
registerDoParallel(cl)
temp <- foreach(I = 1:nrow(dna), .combine = "c", .packages = c("rgeos", "sp")) %dopar% {
  return(c(which.min(
    gDistance(
      SpatialPoints(data.frame(dna[I, 1:2])),
      SpatialPoints(data.frame(match[, 1:2])),
      byid = TRUE
    )
  )))
}
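temp then holds, for each row of dna, the row index of its nearest neighbour in match. A minimal sketch of finishing up (the column handling is an assumption):
stopCluster(cl)  # release the workers when done
result <- cbind(dna, nearest = match[temp, ])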
https://old.reddit.com/r/rstats/comments/aebamb/how_do_i_use_all_the_cores_on_a_server_to_match/
Again thank you for the help :-D

Function in R to pull weather data based on Lat/Lon - RNOAA package

I am building a list of weather observation types in R by matching each airport's latitude/longitude to a nearby weather station.
library(dplyr)
library(stringr)
library(lubridate)
library(rnoaa)
## List of airports you want to include in your weather extract
airport_list <- c("KABE", "KBWI", "KRAL")
## Drill down to your airport locations (I have a separate table that pulls in this info)
airport_list_dd <- airport_locs[airport_locs$icao %in% airport_list,]
## Mutate the data frame to make lat/lon compatible for searching the NOAA GHCND
airport_list_similar <- airport_list_dd %>%
  mutate(lon_similar = str_extract(longitude, "([-0-9]+)\\.."),
         lat_similar = str_extract(latitude, "([-0-9]+)\\.."),
         lon_exact = str_extract(longitude, "([-0-9]+)\\....."),
         lat_exact = str_extract(latitude, "([-0-9]+)\\....."))
## Define your date range
date_min <- Sys.Date() - (10 * 365)
date_max <- Sys.Date()
filter_year <- year(Sys.Date()) - 1
# THIS IS WHERE I AM HAVING THE ISSUE
## Build your weather extracts
ghcnd_near_airport <- list()
build_lon_table <- function(x){
  i <- 1
  for (i in 1:length(x)) {
    lon_similar <- x$lon_similar[i]
    lat_similar <- x$lat_similar[i]
    ghcnd_near_airport <- c(ghcnd_stations %>%
                              filter(str_detect(longitude, lon_similar),
                                     str_detect(latitude, lat_similar)),
                            list(i))
  }
  return(ghcnd_near_airport)
}
but this returns an empty list with a count of 11, which means the function iterates the right number of times but never returns any data into the list.
FOUND THE ANSWER:
build_lon_table <- function(x){
  ghcnd_near_airport_app <- data.frame(id = character(), latitude = numeric(), longitude = numeric(),
                                       elevation = numeric(), state = character(), name = character(),
                                       gsn_flag = character(), wmo_id = character(), element = character(),
                                       first_year = integer(), last_year = integer(), stringsAsFactors = FALSE)
  for (i in 1:nrow(x)) {
    ghcnd_near_airport_filter <- ghcnd_stations %>%
      filter(str_detect(longitude, x$lon_similar[i]),
             str_detect(latitude, x$lat_similar[i]),
             last_year >= filter_year,
             element == "TMIN" | element == "WT01") %>%
      mutate(lon_diff = abs(longitude - as.numeric(airport_list_similar$lon_exact[i])),
             lat_diff = abs(latitude - as.numeric(airport_list_similar$lat_exact[i])),
             total_diff = lon_diff + lat_diff) %>%
      arrange(total_diff)
    ghcnd_near_airport_filter <- head(ghcnd_near_airport_filter, 2)
    ghcnd_near_airport_app <- rbind(ghcnd_near_airport_app, ghcnd_near_airport_filter)
  }
  return(ghcnd_near_airport_app)
}
test <- build_lon_table(airport_list_similar)
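As an aside, rnoaa also ships a helper that does this nearest-station search directly: meteo_nearby_stations(). A hedged sketch, assuming airport_list_dd carries icao, latitude and longitude columns:
library(rnoaa)
station_data <- ghcnd_stations()  # full GHCND station inventory
lat_lon_df <- data.frame(id = airport_list_dd$icao,
                         latitude = as.numeric(airport_list_dd$latitude),
                         longitude = as.numeric(airport_list_dd$longitude))
nearby <- meteo_nearby_stations(lat_lon_df = lat_lon_df,
                                station_data = station_data,
                                year_min = filter_year,
                                var = "TMIN",
                                limit = 2)  # two closest stations per airport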

Looking up tickers for different time periods in a loop with quantmod

I'm able to loop through and calculate the overnight/over-weekend returns for a list of tickers when the time period is the same for every ticker, but am having trouble when the time period I want to look up is different for each ticker.
For example, with:
symbols <- c("AAPL", "GOOG"," MSFT")
dates <- as.Date(c("2015-01-04", "2015-01-05", "2015-01-06"))
example.df <- data.frame(tickers, dates)
example.df
tickers dates
1 AAPL 2015-01-04
2 GOOG 2015-01-05
3 MSFT 2015-01-06
I'd want the overnight return for AAPL between 2015-01-04 and 2015-01-05, for GOOG between 2015-01-05 and 2015-01-06, etc. If it was a Friday, I'd want the next Monday.
I can get what I'm looking for by looking up each ticker individually like this:
library(quantmod)
library(dplyr)
# date range accounts for weekends
getSymbols("AAPL", from = "2016-01-04", to = "2016-01-08")
data <- as.data.frame(AAPL)
colnames(data) <- c("open","high","low","close","volume","adj.")
# overnight return calculation
data$overnight.return <- data$open / lag(data$close, default = 0) - 1
data$overnight.return <- paste(round(data$overnight.return * 100, 3), "%",sep= "")
# the overnight/over-weekend returns for the specified date
data.df.final <- slice(data, 2)
Of course that's terribly slow.
Here's as far as I was able to get trying to make a loop out of it:
# needs to be a loop itself and inside the other 'for' loop somehow I think
symbol.list <- example.df[,1]
start <- data[,2]
end <- data[,2] + days(3)
results <- NULL
for (i in symbol.list) {
  data <- getSymbols(Symbols = i,
                     src = "yahoo",
                     from = start, to = end,
                     auto.assign = FALSE)
  if (inherits(data, 'try-error')) next
  colnames(data) <- c("open","high","low","close","volume","adj.")
  data <- as.data.frame(data)
  data <- cbind(date = rownames(data), data)
  data$overnightRtn <- as.vector(data$open / lag(data$close, default = 0) - 1)
  data$overnightRtn <- paste(round(data$overnightRtn * 100, 3), "%")
  data <- slice(data, 2)
  results <- bind_rows(results, data)
}
How can I add the date looping aspect to the above ticker loop?
Maybe this is what you are looking for. Note that I loop over an index rather than the list itself, so I can refer to the corresponding element of each column of your data frame (it is not optimized, but it does the job you described):
symbols <- c("AAPL", "GOOG"," MSFT") ## " MSFT" has an extra space
dates <- as.Date(c("2015-01-04", "2015-01-05", "2015-01-06"))
example.df <- data.frame(tickers=symbols, dates) ## there was an error here in your example.
library(lubridate)  # for days()
symbol.list <- trimws(example.df[,1])
start <- as.Date(example.df[,2])
end <- as.Date(example.df[,2]) + days(3)
results <- NULL
for (i in 1:NROW(symbol.list)) {
  try(dataX <- getSymbols(Symbols = symbol.list[i],
                          src = "yahoo",
                          from = start[i], to = end[i],
                          auto.assign = FALSE), silent = TRUE)
  if (!exists("dataX")) {cat("Error in ", i, "\n"); next}
  colnames(dataX) <- c("open","high","low","close","volume","adj.")
  dataX <- as.data.frame(dataX)
  dataX <- cbind(date = rownames(dataX), dataX)
  dataX$overnightRtn <- as.vector(dataX$open / lag(dataX$close, default = 0) - 1)
  dataX$overnightRtn <- paste(round(dataX$overnightRtn * 100, 3), "%")
  data2 <- slice(dataX, 2); rm(dataX)
  results <- if (is.null(results)) data2 else rbind(results, data2)
}
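For what it's worth, the same per-row lookup can be written without the exists()/rm() bookkeeping by wrapping one iteration in a function; a sketch under the same assumptions (quantmod, dplyr and lubridate loaded, Yahoo as the source):
fetch_overnight <- function(sym, from, to) {
  x <- try(getSymbols(Symbols = sym, src = "yahoo",
                      from = from, to = to, auto.assign = FALSE),
           silent = TRUE)
  if (inherits(x, "try-error")) return(NULL)  # skip tickers that fail to download
  colnames(x) <- c("open", "high", "low", "close", "volume", "adj.")
  x <- as.data.frame(x)
  x <- cbind(date = rownames(x), x)
  x$overnightRtn <- paste(round((x$open / lag(x$close, default = 0) - 1) * 100, 3), "%")
  slice(x, 2)  # the overnight/over-weekend return row
}
results <- bind_rows(Map(fetch_overnight, symbol.list, start, end))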
