I designed the following scripts months ago and they worked without any issue. In the last few days I have tried to rerun the same script, but I always get the same error. I've changed my script and updated the packages, but I'm still unable to make it work again. The script should give me all the delays in the Belgian railroad stations.
I have added my two separate scripts (one contains the functions) and the error/traceback below.
library(httr)
library(jsonlite)
library(tidyverse)
load.stations <- function(){
  a <- GET("https://api.irail.be/stations/?format=json") # GET all stations from the iRail API
  parsed <- jsonlite::fromJSON(content(a, "text"), flatten = TRUE) # parse JSON into R
  stations <- parsed$station %>%
    filter(grepl("^BE.NMBS.0088", id)) # keep only stations in Belgium; the regex ^ anchors "begins with"
  return(stations)
}
get.time <- function(){
  time <- format(Sys.time(), "%d/%m/%y %H:%M:%S") # format the system time as a dd/mm/yy hh:mm:ss string
  strpt <- strptime(time, "%d/%m/%y %H:%M:%S") # convert the string back into an interpretable date-time
  return(strpt)
}
get.temp_df <- function(stations, i){
  goget <- paste0("https://api.irail.be/liveboard/?format=json&id=", stations$id[i]) # URL for the liveboard of station i (similar to the screens in the station)
  c <- GET(goget) # get the data
  parsed_c <- jsonlite::fromJSON(content(c, "text"), flatten = TRUE) # parse from JSON
  temp_df <- parsed_c$departures$departure # extract the data frame with departures from the parsed JSON
  return(temp_df)
}
add.to.all <- function(all_df, temp_df){
  all_df <- rbind(all_df, temp_df) %>% # append the temporary data frame to the master data frame
    group_by(stationneke, time, vehicle) %>% # group departures by station, time and vehicle to spot duplicates
    top_n(1, importtime) %>% # keep only the most recent observation per group
    ungroup() # lift grouping
  return(all_df)
}
save.day <- function(all_df){
  strpt <- get.time()
  saveRDS(all_df, file = paste(strpt$mday, strpt$mon + 1, strpt$year + 1900, "Punct.rda", sep = "-"))
  Sys.sleep(time = 3600 - (strpt$min*60 + strpt$sec)) # sleep until the next full hour (3600 s minus the seconds already elapsed this hour)
  return(data.frame())
}
library(httr)
library(jsonlite)
library(tidyverse)
## all departures - scraper
loop.scraper <- function(hour_of_pause = 3){
  source("NMBS-punctuality-functions.R")
  all_df <- data.frame() # empty data frame
  stations <- load.stations()
  while (TRUE) { # infinite loop
    strpt <- get.time()
    while (strpt$hour != hour_of_pause) { # runs as long as the hour is not hour_of_pause
      # startloop <- (strpt$min*60 + strpt$sec)
      for (i in 1:nrow(stations)) { # inner loop through the stations
        temp_df <- get.temp_df(stations, i)
        if (is.null(temp_df)) next # skip if the data frame is empty (some stations have closed in recent years)
        temp_df$stationneke <- stations$name[i] # add the name of departure station i
        temp_df$importtime <- Sys.time() # add the time the observation was imported
        all_df <- add.to.all(all_df, temp_df)
        strpt <- get.time()
      } # end of loop through stations
      # stoploop <- (strpt$min*60 + strpt$sec)
    } # end of hour-check loop; the code below runs only when no trains are active (at night)
    all_df <- save.day(all_df) # saves the file and returns an empty data frame
  }
}
Error: lexical error: invalid char in json text.
<br /> <b>Fatal error</b>: Unc
(right here) ------^
5. parse_string(txt, bigint_as_char)
4. parseJSON(txt, bigint_as_char)
3. parse_and_simplify(txt = txt, simplifyVector = simplifyVector, simplifyDataFrame = simplifyDataFrame, simplifyMatrix = simplifyMatrix, flatten = flatten, ...)
2. jsonlite::fromJSON(content(c, "text"), flatten = TRUE)
1. loop.scraper(12)
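The traceback shows that fromJSON() is failing on a response that begins with <br /> <b>Fatal error</b>, i.e. the API intermittently returns an HTML error page instead of JSON, so the parse error originates on the server side rather than in the script itself. As a minimal defensive sketch (safe.parse() is a hypothetical helper, not part of the original scripts), the status code and content type could be checked before parsing:
library(httr)
library(jsonlite)
safe.parse <- function(res) {
  # Parse only when the server returned HTTP 200 with a JSON body;
  # otherwise return NULL so the caller can skip this response.
  if (status_code(res) != 200) return(NULL)
  if (!grepl("json", http_type(res), fixed = TRUE)) return(NULL)
  jsonlite::fromJSON(content(res, "text"), flatten = TRUE)
}
If get.temp_df() called safe.parse(c) instead of fromJSON() directly, the existing is.null(temp_df) check in loop.scraper() would already skip the occasional bad response, since $ on NULL simply returns NULL in R.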
PROJECT - collecting topic information from Australian High Court decisions published online to analyse trends over time.
CURRENT PROBLEM - trying to iterate 'case_importer' function (which returns relevant character strings from decisions) over each year a number of times equal to the number of cases decided in that year.
I can achieve this manually by using a for loop which iterates as many times as there were cases each year - using 2020 (which has had 14 cases so far) for example:
for (i in 1:14) {
  hc_cases_2020_list[[i]] <- case_importer(2020, i)
}
However, I would rather not have to rewrite the number of iterations for each year manually.
I have stored the number of cases per year in an Excel sheet called 'hc_cases_per_year', which has two columns, 'year' and 'num_cases', and created a function which returns the 'num_cases' value for a given year.
Any advice in solving the specific problem or taking a better approach would be beyond appreciated.
My code so far is below - thanks for taking the time to read!
library(rvest)
library(dplyr)
library(readxl)
# Function to import html data and turn it into text
html_to_text <- function(url) {
  url %>%
    read_html %>%
    html_nodes("li , p , blockquote , b , h2") %>%
    html_text
}
# Function to extract topic paragraphs using grep
grep_topics <- function(text) {
  grep("^\\n?(?:\\w+\\s)+\\(?C?t?h?\\)?\\s?–.*", text, perl = TRUE, value = TRUE)
}
# Function to wrap the above functions into a single function that takes year and case number
case_importer <- function(year, case_num) {
  url <- url_maker(year, case_num)
  text <- html_to_text(url)
  grep_topics(text)
}
# Function + command to create empty list for each year
list_namer <- function(year) {
  assign(paste0("hc_cases_", year, "_list"), list(), envir = .GlobalEnv)
}
for (i in 1948:2020) {
  list_namer(i)
}
# Function to change list to dataframe
list_to_df <- function(list) {
  num_obs <- sapply(list, length)
  seq_max <- seq_len(max(num_obs))
  t(sapply(list, "[", i = seq_max))
}
# Function to return number of cases in given year
cases_in_year <- function(year) {
  hc_cases_per_year$num_cases[year - 1947]
}
# Function to create a URL for any given year and case number
url_maker <- function(year, case_num) {
  paste0("http://classic.austlii.edu.au/au/cases/cth/HCA/", year, "/", case_num, ".html")
}
# Importing hc_cases_per_year
hc_cases_per_year <- read_excel("PATH/hc_cases_per_year.xlsx")
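Since cases_in_year() already returns the number of cases for a given year, one way to avoid hand-writing the iteration counts is to let that value set the inner range. A minimal sketch, assuming hc_cases_per_year has been imported as above, and collecting everything in a single named list rather than assign()-ing one list per year:
# One sub-list per year, sized by the number of cases decided that year.
hc_cases <- list()
for (year in 1948:2020) {
  hc_cases[[as.character(year)]] <- lapply(
    seq_len(cases_in_year(year)),
    function(case_num) case_importer(year, case_num)
  )
}
seq_len() also copes with years that have no cases, since seq_len(0) produces an empty inner loop.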
I am trying to run a code which focuses on
Getting Stock Data
Sorting and cleaning data
Analyzing data
However, I keep getting errors; the first one that comes up is
Error: $ operator is invalid for atomic vectors
The code is the following:
library(quantmod)
library(reshape)
library(ggplot2)
##Choose stock tickers to grab data and other parameters
ticker <- c("MSFT","GOOG","AMZN","IBM")
ExpectedGrowthRate <- -1
MarginSafety <- .10
GrowthDeclineRate <- .05
DiscountRate <- .09
DCF_YearsProjection <- 5
LongTermGrowthRate <- .03
##Get the financial statement data of the tickers for the past four years
data <- data.frame()
for (i in seq_along(ticker)){
  # Get the stock ticker information
  temp_data <- try(getFinancials(ticker[i], type = c('BS','IS','CF'), period = c('A'), auto.assign = FALSE), silent = TRUE)
  # Create temporary data frames for each of the financial statements
  temp_dataBS <- data.frame(temp_data$BS$A, FinancialStatement = "BalanceSheet")
  temp_dataBS$Metric <- rownames(temp_dataBS)
  temp_dataIS <- data.frame(temp_data$IS$A, FinancialStatement = "IncomeStatement")
  temp_dataIS$Metric <- rownames(temp_dataIS)
  temp_dataCF <- data.frame(temp_data$CF$A, FinancialStatement = "CashFlow")
  temp_dataCF$Metric <- rownames(temp_dataCF)
  temp_data <- rbind(melt(temp_dataBS, id = c("Metric", "FinancialStatement")),
                     melt(temp_dataIS, id = c("Metric", "FinancialStatement")),
                     melt(temp_dataCF, id = c("Metric", "FinancialStatement")))
  # Rbind the data to append ticker information
  data <- rbind(data, cbind(temp_data, "Ticker" = ticker[i]))
}
In what way is this fixable?
EDIT: it seems that the problem lies in the function getFinancials, as when I run the following code:
temp_data <- getFinancials("IBM", type=c('BS','IS','CF'), period=c('A'), auto.assign = FALSE)
I get the following error:
Error in thead[x]:thead[x + 1] : NA/NaN argument
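A side observation on the first error: try(..., silent = TRUE) returns a character object of class "try-error" when the call fails, and the loop then evaluates temp_data$BS on it, which is exactly what raises $ operator is invalid for atomic vectors. A guard along these lines (a sketch only; it skips failing tickers rather than repairing the underlying getFinancials() scrape, which the edit shows is broken on its own) would stop that follow-on error:
for (i in seq_along(ticker)){
  temp_data <- try(getFinancials(ticker[i], type = c('BS','IS','CF'), period = c('A'), auto.assign = FALSE), silent = TRUE)
  # Skip this ticker if the download failed, instead of calling $ on a try-error.
  if (inherits(temp_data, "try-error")) next
  # ... build and bind the temporary data frames as above ...
}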
I'm trying to pull some data from an API and throw it all into a single data frame. I'm trying to put a variable into the URL I'm pulling from and then loop it to pull data for all 58 keys. Here's what I have so far, with notes.
library("jsonlite")
library("httr")
library("lubridate")
options(stringsAsFactors = FALSE)
url <- "http://api.kuroganehammer.com"
### This gets me a list of 58 observations, I want to use this list to
### pull data for each using an API
raw.characters <- GET(url = url, path = "api/characters")
## Convert the results from unicode to a JSON
text.raw.characters <- rawToChar(raw.characters$content)
## Convert the JSON into an R object. Check the class of the object after
## it's retrieved and reformat appropriately
characters <- fromJSON(text.raw.characters)
class(characters)
## This pulls data for an individual character. I want to get one of
## these for all 58 characters by looping this and replacing the 1 in the
## URL path for every number through 58.
raw.bayonetta <- GET(url = url, path = "api/characters/1/detailedmoves")
text.raw.bayonetta <- rawToChar(raw.bayonetta$content)
bayonetta <- fromJSON(text.raw.bayonetta)
## This is the function I tried to create, but I get a lexical error when
## I call it, and I have no idea how to loop it.
move.pull <- function(x) {
  char.x <- x
  raw.x <- GET(url = url, path = cat("api/characters/", char.x, "/detailedmoves", sep = ""))
  text.raw.x <- rawToChar(raw.x$content)
  char.moves.x <- fromJSON(text.raw.x)
  char.moves.x$id <- x
  return(char.moves.x)
}
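A note on the lexical error: cat() prints its arguments to the console and returns NULL, so GET() is effectively called with path = NULL, and fromJSON() then receives whatever the bare base URL returns. Building the path with paste0() instead, as a one-line sketch, fixes that particular line:
raw.x <- GET(url = url, path = paste0("api/characters/", char.x, "/detailedmoves"))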
The first part of this:
library(jsonlite)
library(httr)
library(lubridate)
library(tidyverse)
base_url <- "http://api.kuroganehammer.com"
res <- GET(url = base_url, path = "api/characters")
content(res, as="text", encoding="UTF-8") %>%
fromJSON(flatten=TRUE) %>%
as_tibble() -> chars
Gets you a data frame of the characters.
This:
pb <- progress_estimated(length(chars$id))
map_df(chars$id, ~{
  pb$tick()$print()
  Sys.sleep(sample(seq(0.5, 2.5, 0.5), 1)) # be kind to the free API
  res <- GET(url = base_url, path = sprintf("api/characters/%s/detailedmoves", .x))
  content(res, as="text", encoding="UTF-8") %>%
    fromJSON(flatten=TRUE) %>%
    as_tibble()
}, .id = "id") -> moves
Gets you a data frame of all the "moves" and adds the "id" for the character. You get a progress bar for free, too.
You can then either left_join() as needed or group & nest the moves data into a separate list column. If you want that from the start, you can use map() instead of map_df().
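For example, a minimal sketch of the join, assuming the character ids really are the sequential 1 through 58 described in the question (the .id column produced by map_df() is the position in chars$id as a character string, so it needs converting before joining):
moves %>%
  mutate(id = as.integer(id)) %>%
  left_join(chars, by = "id")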
Leave in the time pause code. It's a free API and you should likely increase the pause times to avoid DoS'ing their site.
How can I ignore a data set if some column names don't exist in it?
I have a list of weather data from a stream, but certain key weather conditions seem to be missing from some entries, and therefore I get the error below with rbind:
Error in rbind(deparse.level, ...) :
numbers of columns of arguments do not match
My code:
library(plyr) # for ldply
weatherDf <- data.frame()
for (i in weatherData) {
  # Get the airport code.
  airport <- i$airport
  # Get the date.
  date <- as.POSIXct(as.numeric(as.character(i$timestamp))/1000, origin = "1970-01-01", tz = "UTC-1")
  # Get the data in dailysummary only.
  dailySummary <- i$dailysummary
  weatherDf <- rbind(weatherDf, ldply(
    list(dailySummary),
    function(x) c(airport, format(as.Date(date), "%Y-%m-%d"), x[["meanwindspdi"]], x[["meanwdird"]], x[["meantempm"]], x[["humidity"]])
  ))
}
So how can I make sure these key conditions below exist in the data:
meanwindspdi
meanwdird
meantempm
humidity
If any of them does not exist, then ignore the whole bunch. Is that possible?
EDIT:
The content of weatherData is in a jsfiddle (I can't post it here as it is too long, and I don't know the best place to show the data publicly for R...)
EDIT 2:
I get an error when I try to export the data to a txt file:
> write.table(weatherData,"/home/teelou/Desktop/data/data.txt",sep="\t",row.names=FALSE)
Error in data.frame(date = list(pretty = "January 1, 1970", year = "1970", :
arguments imply differing number of rows: 1, 0
What does it mean? It seems that there are some errors in the data...
EDIT 3:
I have exported my entire data in .RData to my google drive:
https://drive.google.com/file/d/0B_w5RSQMxtRSbjdQYWJMX3pfWXM/view?usp=sharing
If you use RStudio, then you can just import the data.
EDIT 4:
target_names <- c("meanwindspdi", "meanwdird", "meantempm", "humidity")
# If it has data then loop it.
if (!is.null(weatherData)) {
  # Initialize a data frame.
  weatherDf <- data.frame()
  for (i in weatherData) {
    if (!all(target_names %in% names(i)))
      next
    # Get the airport code.
    airport <- i$airport
    # Get the date.
    date <- as.POSIXct(as.numeric(as.character(i$timestamp))/1000, origin = "1970-01-01", tz = "UTC-1")
    # Get the data in dailysummary only.
    dailySummary <- i$dailysummary
    weatherDf <- rbind(weatherDf, ldply(
      list(dailySummary),
      function(x) c(airport, format(as.Date(date), "%Y-%m-%d"), x[["meanwindspdi"]], x[["meanwdird"]], x[["meantempm"]], x[["humidity"]])
    ))
  }
  # Rename column names.
  colnames(weatherDf) <- c("airport", "key_date", "ws", "wd", "tempi", "humidity")
  # Convert certain weatherDf columns to numeric.
  columns <- c("ws", "wd", "tempi", "humidity")
  weatherDf[, columns] <- lapply(columns, function(x) as.numeric(weatherDf[[x]]))
}
Inspect the weatherDf:
> View(weatherDf)
Error in .subset2(x, i, exact = exact) : subscript out of bounds
You can use next to skip the current iteration of the loop and go to the next iteration:
target_names <- c("meanwindspdi", "meanwdird", "meantempm", "humidity")
for (i in weatherData) {
  if (!all(target_names %in% names(i)))
    next
  # continue with loop...
}
I'm looking for some assistance in writing R code that iterates through the rows of a dataframe, passes the values in each row to a function, and prints the output to an Excel file, a txt file, or just the console.
The purpose of this is to automate a bunch of distance/time queries (several hundred) to google maps using the function found at this website: http://www.nfactorialanalytics.com/r-vignette-for-the-week-finding-time-distance-between-two-places/
The function on that website is as follows:
library(XML)
library(RCurl)
distance2Points <- function(origin, destination){
  results <- list()
  xml.url <- paste0('http://maps.googleapis.com/maps/api/distancematrix/xml?origins=', origin, '&destinations=', destination, '&mode=driving&sensor=false')
  xmlfile <- xmlParse(getURL(xml.url))
  dist <- xmlValue(xmlChildren(xpathApply(xmlfile, "//distance")[[1]])$value)
  time <- xmlValue(xmlChildren(xpathApply(xmlfile, "//duration")[[1]])$value)
  distance <- as.numeric(sub(" km", "", dist))
  time <- as.numeric(time)/60
  distance <- distance/1000
  results[['time']] <- time
  results[['dist']] <- distance
  return(results)
}
The dataframe will contain two columns: origin postal code and destination postal code (Canada, eh?). I'm a beginner R programmer, so I know how to use read.table to load a txt file into a dataframe. I'm just not sure how to iterate through the dataframe, each time passing values to the distance2Points function and executing it. I think this can be done with either a for loop or one of the apply calls?
Thanks for the help!
edit:
To keep it simple, let's assume I want to transform these two vectors into a dataframe:
> a <- c("L5B4P2","L5B4P2")
> b <- c("M5E1E5", "A2N1T3")
> postcodetest <- data.frame(a,b)
> postcodetest
a b
1 L5B4P2 M5E1E5
2 L5B4P2 A2N1T3
How should I go about iterating over these two rows to return both distances and times from the distance2Points function?
Here's one way to do it, using lapply to produce a list with the results for each row in your data, and Reduce(rbind, [yourlist]) to concatenate that list into a data frame whose rows correspond to the ones in your original. To make this work, we also have to tweak the code in the original function to return a one-row data frame, so I've done that here.
distance2Points <- function(origin, destination){
  require(XML)
  require(RCurl)
  xml.url <- paste0('http://maps.googleapis.com/maps/api/distancematrix/xml?origins=', origin, '&destinations=', destination, '&mode=driving&sensor=false')
  xmlfile <- xmlParse(getURL(xml.url))
  dist <- xmlValue(xmlChildren(xpathApply(xmlfile, "//distance")[[1]])$value)
  time <- xmlValue(xmlChildren(xpathApply(xmlfile, "//duration")[[1]])$value)
  distance <- as.numeric(sub(" km", "", dist))
  time <- as.numeric(time)/60
  distance <- distance/1000
  # this gives you a one-row data frame instead of a list, b/c it's easy to rbind
  results <- data.frame(time = time, distance = distance)
  return(results)
}
# now apply that function rowwise to your data, using lapply, and roll the results
# into a single data frame using Reduce(rbind)
results <- Reduce(rbind, lapply(seq(nrow(postcodetest)), function(i)
  distance2Points(postcodetest$a[i], postcodetest$b[i])))
Result when applied to your sample data:
> results
time distance
1 27.06667 27.062
2 1797.80000 2369.311
If you would prefer to do this without creating a new object, you could also write separate functions for computing time and distance -- or a single function with those outputs as options -- and then use sapply or just mutate to create new columns in your original data frame. Here's how that might look using sapply:
distance2Points <- function(origin, destination, output){
  require(XML)
  require(RCurl)
  xml.url <- paste0('http://maps.googleapis.com/maps/api/distancematrix/xml?origins=',
                    origin, '&destinations=', destination, '&mode=driving&sensor=false')
  xmlfile <- xmlParse(getURL(xml.url))
  if (output == "distance") {
    y <- xmlValue(xmlChildren(xpathApply(xmlfile, "//distance")[[1]])$value)
    y <- as.numeric(sub(" km", "", y))/1000
  } else if (output == "time") {
    y <- xmlValue(xmlChildren(xpathApply(xmlfile, "//duration")[[1]])$value)
    y <- as.numeric(y)/60
  } else {
    y <- NA
  }
  return(y)
}
postcodetest$distance <- sapply(seq(nrow(postcodetest)), function(i)
  distance2Points(postcodetest$a[i], postcodetest$b[i], "distance"))
postcodetest$time <- sapply(seq(nrow(postcodetest)), function(i)
  distance2Points(postcodetest$a[i], postcodetest$b[i], "time"))
And here's how you could do it in a dplyr pipe with mutate:
library(dplyr)
postcodetest <- postcodetest %>%
  mutate(distance = sapply(seq(nrow(postcodetest)), function(i)
           distance2Points(a[i], b[i], "distance")),
         time = sapply(seq(nrow(postcodetest)), function(i)
           distance2Points(a[i], b[i], "time")))