I need to read this URL into a data frame with R: https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest, but I confess that I cannot get much further than this...
# R packages
library(tidyverse)
library(dplyr)
library(rvest)
and...
url <- "https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest"
page <- read_html(url)
page
{html_document}
<html>
[1] <body><p>2.3|lacnic|20220922|84615|19870101|20220922|-0300\nlacnic|*|ipv4 ...
I tried to use rvest to find tables, but...
tables <- page %>%
  html_table(fill = TRUE)
tables
list()
My expected data frame result is something like this:
In other words, using | as the separator... How can I extract this data and convert it to an R data frame?
Try this:
url <- "https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest"
df <- read.delim(url(url), sep = "|", row.names = NULL)
df <- readr::read_delim(
  file = "https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest",
  delim = "|",
  col_names = FALSE
)
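A minimal sketch for naming the columns as well; the names below are an assumption based on the RIR "delegated-extended" layout (registry|cc|type|start|value|date|status|opaque-id), and the version/summary rows at the top of the file may still need to be dropped after inspecting the first few rows:
library(readr)

df <- read_delim(
  "https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest",
  delim = "|",
  col_names = c("registry", "cc", "type", "start", "value",
                "date", "status", "opaque_id")  # assumed layout, verify with head(df)
)
head(df)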
I am very new to R and trying to set up some automation. I have 10-20 JSON files in a folder, and I want to run an R script for each JSON file so that I can extract data from each file and keep appending the extracted data to one data frame.
In the code below, df is the data frame that will store the data.
In my case I was able to extract data from one JSON file and store it in df. How do I do this for all the JSON files, appending the extracted data to df?
library(jsonlite)   # fromJSON()
library(tidyverse)  # str_split(), %>%
library(janitor)    # row_to_names()

json_file <- "path_to_file/file.json"
json_data <- fromJSON(json_file)
df <- data.frame(str_split(json_data$data$summary$bullet, pattern = " - ")) %>%
  row_to_names(row_number = 1)
My output should be a data frame that contains the extracted data from every file, in sequence.
I would really appreciate any help.
Something like the following might do what the question asks for. It is untested, since there are no data.
The JSON processing package is just a guess; there are alternatives on CRAN, so change the call to library() as needed.
library(jsonlite)

read_and_process_json <- function(x, path) {
  json_file <- file.path(path, x)
  json_data <- fromJSON(json_file)
  json_bullet <- stringr::str_split(json_data$data$summary$bullet, pattern = " - ")
  data.frame(json_bullet) |>
    janitor::row_to_names(row_number = 1)
}

base_path <- "path_to_file"
json_files <- list.files(path = base_path, pattern = "\\.json$")
df_list <- lapply(json_files, read_and_process_json, path = base_path)
df_all <- do.call(rbind, df_list)
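If the individual data frames do not all end up with exactly the same columns, do.call(rbind, ...) will error; under the same assumptions about the JSON structure, purrr::map_dfr() is a more forgiving sketch that applies the helper and row-binds (matching columns by name) in one step:
library(purrr)

# Same helper as above, applied to every file and bound into one data frame.
df_all <- map_dfr(json_files, read_and_process_json, path = base_path)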
I am a newbie at using R and here's my attempt at playing around with code to scrape quotes from multiple pages.
# Load Libraries
library(rvest) # To Scrape
library(tidyverse) # To Manipulate Data
# Scrape Multiple Pages
for (i in 1:4){
  site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/", i))
  temp <- site_to_scrape %>%
    html_nodes(".text") %>%
    html_text()
  content <- append(content, temp)
}
#Export Results To CSV File
write.csv(content, file = "content.csv", row.names = FALSE)
I encountered an "object not found" error for the content variable. How can I overcome this error and set up the object so that it is reusable in the append line?
Growing a vector in a loop is very inefficient if you are scraping many pages. Instead, you should initialise a list of a specific length, which you know beforehand.
library(rvest)

n <- 4
content <- vector("list", n)

# Scrape Multiple Pages
for (i in 1:n){
  site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/", i))
  content[[i]] <- site_to_scrape %>%
    html_nodes(".text") %>%
    html_text()
}

write.csv(unlist(content), file = "content.csv", row.names = FALSE)
Another option, which avoids initialising anything, is to use sapply/lapply:
all_urls <- paste0("http://quotes.toscrape.com/page/",1:4)
content <- unlist(lapply(all_urls, function(x)
x %>% read_html %>% html_nodes(".text") %>% html_text()))
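To write the combined result to a CSV as in the loop version, wrapping the vector in a data frame keeps the column name explicit (text is just a label chosen here):
write.csv(data.frame(text = content), file = "content.csv", row.names = FALSE)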
I have searched and found a way: assign an empty object before the loop with content = c().
# Load Libraries
library(rvest) # To Scrape
library(tidyverse) # To Manipulate Data
content = c()
# Scrape Multiple Pages
for (i in 1:4){
  site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/", i))
  temp <- site_to_scrape %>%
    html_nodes(".text") %>%
    html_text()
  content <- append(content, temp)
}
#Export Results To CSV File
write.csv(content, file = "content.csv", row.names = FALSE)
I managed to write a scraper for gathering election info in R (rvest), but now I am struggling with how to save the data in one CSV file instead of in separate CSV files.
Here is my working code, where I can scrape pages 11, 12 and 13 separately.
library(rvest)
library(xml2)
do.call(rbind, lapply(11:13, function(n) {
  url <- paste0("http://www.cvk.gov.ua/pls/vnd2014/WP040?PT001F01=910&pf7331=", n)
  mi <- read_html(url) %>% html_table(fill = TRUE)
  mi[[8]]
  file <- paste0("election2014_", n, ".csv")
  if (!file.exists(file)) write.csv(mi[[8]], file)
  Sys.sleep(5)
}))
I tried to do this at the end, but it did not work as I expected:
write.csv(rbind(mi[[8]], url), file = "election2014.csv")
Try this one:
library(rvest)
library(tidyverse)

scr <- function(n){
  url <- paste0("http://www.cvk.gov.ua/pls/vnd2014/WP040?PT001F01=910&pf7331=", n)
  df <- read_html(url) %>%
    html_table(fill = TRUE) %>%
    .[[8]] %>%
    data.frame()
  colnames(df) <- df[1, ]
  df <- df[-1, ]
}

res <- 11:13 %>%
  map_df(., scr)

write.csv2(res, "odin_tyr.csv")
I wasn't able to get your code to work, but you could try creating an empty data frame before running your code, and then do this before writing a CSV file with the complete data:
df <- rbind(df, mi[[8]])
You could also consider turning your CSV files into one using the purrr package:
library(tidyverse)  # readr::read_csv(), purrr::map()/reduce()

files <- list.files("folder_name", pattern = "*.csv", full.names = TRUE)
df <- files %>%
  map(read_csv) %>%
  reduce(rbind)
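A slightly shorter equivalent, assuming the CSV files share a common layout, is map_dfr(), which reads and row-binds in one step and matches columns by name:
df <- list.files("folder_name", pattern = "*.csv", full.names = TRUE) %>%
  map_dfr(read_csv)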
I'm currently trying to create a function that will read many PDF files into a data frame. My ultimate goal is to have it read specific information from the PDF files and convert it into a data.frame with insurance plan names in each row and columns comprising the information I need, such as individual plan prices, family plan prices, etc. I have been following an answer given for a similar question in the past. However, I keep getting an error. Here is a link to two different files I am practicing on (1 and 2).
Here are my code and error below:
library(tm)  # readPDF()

PDFtoDF = function(file) {
  dat = readPDF(control = list(text = "-layout"))(elem = list(uri = file),
                                                  language = "en", id = "id1")
  dat = c(as.character(dat))
  dat = gsub("^ ?([0-9]{1,3}) ?", "\\1|", dat)
  dat = gsub("(, HVOL )", "\\1 ", dat)
  dat = gsub(" {2,100}", "|", dat)
  excludeRows = lapply(gregexpr("\\|", dat), function(x) length(x)) != 6
  write(dat[excludeRows], "rowsToCheck.txt", append = TRUE)
  dat = dat[!excludeRows]
  dat = read.table(text = dat, sep = "", quote = "", stringsAsFactors = FALSE)
  names(dat) = c("Plan", "Individual", "Family")
  return(dat)
}
files <- list.files(pattern = "pdf$")
df = do.call("rbind", lapply(files, PDFtoDF))
Error in read.table(text = dat, sep = "", quote = "", stringsAsFactors = FALSE) :
  no lines available in input
Before this approach, I had been using the pdftools package and regular expressions. That approach worked, except that it was difficult to specify a pattern for some parts of the document, such as the plan name at the top. I was hoping the methodology I'm trying now would help, since it extracts the text into separate strings for me.
Here's the best answer:
require(readtext)
df <- readtext("*.pdf")
Yes, it's that simple with the readtext package!
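Note that readtext() returns a data frame with one row per PDF and two columns, doc_id and text, so the plan names and prices still have to be parsed out of the text column afterwards:
str(df)           # doc_id, text -- one row per PDF
head(df$text, 1)  # raw text of the first document, still to be parsed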
I'm trying to import the list of nuclear test sites (from Wikipedia's page) into a data.frame using the code below:
library(RCurl)
library(XML)
theurl <- "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites"
webpage <- getURL(theurl)
webpage <- readLines(tc <- textConnection(webpage)); close(tc)
pagetree <- htmlTreeParse(webpage, error=function(...){}, useInternalNodes = TRUE)
# Find the XPath (go to the webpage, right-click > Inspect Element, find the table, then right-click > Copy XPath)
myxpath <- "//*[@id='mw-content-text']/table[2]"
# Extract table header and contents
tablehead <- xpathSApply(pagetree, paste(myxpath,"/tr/th",sep=""), xmlValue)
results <- xpathSApply(pagetree, paste(myxpath,"/tr/td",sep=""), xmlValue)
# Convert character vector to dataframe
content <- as.data.frame(matrix(results, ncol = 5, byrow = TRUE))
names(content) <- c("Testing country", "Location", "Site", "Coordinates", "Notes")
However, there are multiple sub-headers that prevent the data.frame from being populated consistently. How can I fix this?
Take a look at the htmltab package. It allows you to use the subheaders for populating a new column:
library(htmltab)
tab <- htmltab("https://en.wikipedia.org/wiki/List_of_nuclear_test_sites",
               which = "/html/body/div[3]/div[3]/div[4]/table[2]",
               header = 1 + "//tr/th[@style='background:#efefff;']",
               rm_nodata_cols = FALSE)
I found this example by Carson Sievert that worked well for me:
library(rvest)
theurl <- "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites"
# First, grab the page source
content <- read_html(theurl) %>%  # html() is deprecated in newer rvest; read_html() replaces it
# then extract the first node with class of wikitable
html_node(".wikitable") %>%
# then convert the HTML table into a data frame
html_table()
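If the table you want is not the first wikitable on the page, html_nodes() plus an index does the same job; the index 2 below is only an assumption carried over from the table[2] XPath in the question:
content <- read_html(theurl) %>%
  html_nodes(".wikitable") %>%
  .[[2]] %>%
  html_table(fill = TRUE)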
Have you tried this?
library(RCurl)  # getURL()
library(XML)    # htmlParse(), xpathSApply()

l.wiki.url <- getURL(url = "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites")
l.wiki.par <- htmlParse(file = l.wiki.url)
l.tab.con <- xpathSApply(doc = l.wiki.par,
                         path = "//table[@class='wikitable']//tr//td",
                         fun = xmlValue)
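l.tab.con comes back as a flat character vector, so it still needs reshaping; the matrix() approach from the question works only if every table row has the same five cells:
l.tab.df <- as.data.frame(matrix(l.tab.con, ncol = 5, byrow = TRUE),
                          stringsAsFactors = FALSE)
names(l.tab.df) <- c("Testing country", "Location", "Site", "Coordinates", "Notes")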