Download multiple .csv from url in R

I have multiple .csv datasets and I want to download them and save each one under a different name. Many thanks in advance.
casualty_2005 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2005-gla-data-extract-casualty.csv", header=T)
casualty_2006 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2006-gla-data-extract-casualty.csv", header=T)
casualty_2007 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2007-gla-data-extract-casualty.csv", header=T)
casualty_2008 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2008-gla-data-extract-casualty.csv", header=T)
casualty_2009 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2009-gla-data-extract-casualty.csv", header=T)
casualty_2010 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2010-gla-data-extract-casualty.csv", header=T)
casualty_2011 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2011-gla-data-extract-casualty.csv", header=T)
casualty_2012 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2012-gla-data-extract-casualty.csv", header=T)
casualty_2013 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2013-gla-data-extract-casualty.csv", header=T)
casualty_2014 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2014-gla-data-extract-casualty.csv", header=T)

Similar questions have been asked many times: create a vector of names/links, then read in all the files in an lapply loop.
Note that read.csv's default is header = TRUE.
url_fmt <- "https://tfl.gov.uk/cdn/static/cms/documents/%04d-gla-data-extract-casualty.csv"
url_years <- 2005:2014
url_vec <- sprintf(url_fmt, url_years)
df_list <- lapply(url_vec, read.csv)
names(df_list) <- url_years
head(df_list[[1]]) # first file, top 6 rows
head(df_list[["2005"]]) # same file
head(df_list$`2005`) # same file
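If a single combined table is more convenient than ten separate objects, here is a minimal sketch (not part of the original answer) that tags each data frame with its year and stacks them, assuming the yearly files share the same columns:
# a sketch, assuming all yearly files have identical columns
df_list <- Map(function(df, yr) transform(df, year = yr), df_list, url_years)
casualty_all <- do.call(rbind, df_list)
str(casualty_all) # one data frame, with a year column on every row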
Edit
After reading doctshind's answer, I realized that the question is asking how to download the files, not how to read them.
The instructions on setting a new directory are optional.
#old_dir <- getwd()
#setwd('~/tmp')
lapply(url_vec, function(x) download.file(x, destfile = basename(x)))
list.files(pattern = '\\.csv')
# [1] "2005-gla-data-extract-casualty.csv"
# [2] "2006-gla-data-extract-casualty.csv"
# [3] "2007-gla-data-extract-casualty.csv"
# [4] "2008-gla-data-extract-casualty.csv"
# [5] "2009-gla-data-extract-casualty.csv"
# [6] "2010-gla-data-extract-casualty.csv"
# [7] "2011-gla-data-extract-casualty.csv"
# [8] "2012-gla-data-extract-casualty.csv"
# [9] "2013-gla-data-extract-casualty.csv"
#[10] "2014-gla-data-extract-casualty.csv"
#setwd(old_dir)
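If some of the downloads can fail, a hedged variant (not from the original answer) wraps each call in tryCatch() so one bad URL does not stop the loop; mode = "wb" avoids corrupting files on Windows:
# a sketch with basic error handling; returns status codes or error messages
results <- lapply(url_vec, function(x) {
  tryCatch(download.file(x, destfile = basename(x), mode = "wb"),
           error = function(e) conditionMessage(e))
})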

Here is the fully working code with explanations:
# List of File URLs
urlist <- list(
  "https://tfl.gov.uk/cdn/static/cms/documents/2005-gla-data-extract-casualty.csv",
  "https://tfl.gov.uk/cdn/static/cms/documents/2006-gla-data-extract-casualty.csv",
  "https://tfl.gov.uk/cdn/static/cms/documents/2007-gla-data-extract-casualty.csv",
  "https://tfl.gov.uk/cdn/static/cms/documents/2008-gla-data-extract-casualty.csv",
  "https://tfl.gov.uk/cdn/static/cms/documents/2009-gla-data-extract-casualty.csv",
  "https://tfl.gov.uk/cdn/static/cms/documents/2010-gla-data-extract-casualty.csv",
  "https://tfl.gov.uk/cdn/static/cms/documents/2011-gla-data-extract-casualty.csv",
  "https://tfl.gov.uk/cdn/static/cms/documents/2012-gla-data-extract-casualty.csv",
  "https://tfl.gov.uk/cdn/static/cms/documents/2013-gla-data-extract-casualty.csv",
  "https://tfl.gov.uk/cdn/static/cms/documents/2014-gla-data-extract-casualty.csv"
)
setwd("~/so")
#for loop to visit each url and download its file
for (i in seq_along(urlist)) {
  #get the file name from the url path
  destfile <- basename(urlist[[i]])
  #download the current file
  download.file(urlist[[i]], destfile)
}

Related

Upload text document in R

I am trying to upload several text documents into a data frame in R. My desired output is a matrix with two columns:
| DOCUMENT   | CONTENT              |
|------------|----------------------|
| Document A | This is the content. |
| Document B | This is the content. |
| Document C | This is the content. |
The column "CONTENT" should contain all the text from each text document (a 10-K report).
> setwd("C:/Users/folder")
> folder <- getwd()
> corpus <- Corpus(DirSource(directory = folder, pattern = "*.txt"))
This creates a corpus that I can tokenize, but I can't manage to convert it to a data frame or reach my desired output.
Can somebody help me?
If you're only working with .txt files and your end goal is a data frame, then I think you can skip the corpus step and simply read in all your files as a list. The hard part is getting the names of the .txt files into a column called DOCUMENT, but this can be done in base R.
# make a reproducible example
a <- "this is a test"
b <- "this is a second test"
c <- "this is a third test"
write(a, "a.txt"); write(b, "b.txt"); write(c, "c.txt")
# get working dir
folder <- getwd()
# get names/locations of all files
filelist <- list.files(path = folder, pattern = "\\.txt$", full.names = FALSE)
# read in the files and put them in a list
lst <- lapply(filelist, readLines)
# extract the names of the files without the `.txt` stuff
names(lst) <- filelist
namelist <- fs::path_file(filelist)
namelist <- unlist(lapply(namelist, sub, pattern = "\\.txt$", replacement = ""),
                   use.names = FALSE)
# give every matrix in the list its own name, which was its original file name
lst <- mapply(cbind, lst, "DOCUMENT" = namelist, SIMPLIFY = FALSE)
# combine into a dataframe
x <- do.call(rbind.data.frame, lst)
# a small amount of clean-up
rownames(x) <- NULL
names(x)[names(x) == "V1"] <- "CONTENT"
x <- x[,c(2,1)]
x
#> DOCUMENT CONTENT
#> 1 a this is a test
#> 2 b this is a second test
#> 3 c this is a third test

Select CSV files and read in pairs

I am comparing two pairs of csv files at a time. The files each end with a number, like cars_file2.csv, Lorries_file3.csv, computers_file4.csv, phones_file5.csv. I have about 70 files per folder, and I compare cars_file2.csv with Lorries_file3.csv, then Lorries_file3.csv with computers_file4.csv, so the pattern of numbers is 2,3, 3,4, 4,5 and so on. Is there a smart way to handle this instead of manually coming back to change the file names the way I am reading them here, or can I use the last number in each csv name to read them programmatically? NOTE: the files share the suffix _file:
library(daff)
setwd("path")
# Load csvs to compare into data frames
x_original <- read.csv("cars_file2.csv", strip.white=TRUE, stringsAsFactors = FALSE)
x_changed <- read.csv("Lorries_file3.csv", strip.white=TRUE, stringsAsFactors = FALSE)
render(diff_data(x_original,x_changed ,ignore_whitespace=TRUE,count_like_a_spreadsheet = FALSE))
My intention is to compare each pair of csv files and record field additions, deletions, and modifications.
You may want to load all files at once and do your comparison with a full list of files.
This may help:
# your path
path <- "insert your path"
# get folders in this path
dir_data <- as.list(list.dirs(path))
# get all filenames
dir_data <- lapply(dir_data, function(x){
  # list of folders
  files <- list.files(x)
  files <- paste(x, files, sep = "/")
  # only .csv files
  files <- files[substring(files, nchar(files) - 3, nchar(files)) %in% ".csv"]
  # remove possible errors
  files <- files[!is.na(files)]
  # save if there are files
  if(length(files) >= 1){
    return(files)
  }
})
# delete NULL-values (compact() comes from purrr)
dir_data <- purrr::compact(dir_data)
# make it a named vector
dir_data <- unique(unlist(dir_data))
names(dir_data) <- sub(pattern = "(.*)\\..*$", replacement = "\\1", basename(dir_data))
names(dir_data) <- as.numeric(substring(names(dir_data),nchar(names(dir_data)),nchar(names(dir_data))))
# remove possible NULL-values
dir_data <- dir_data[!is.na(names(dir_data))]
# make it a list again
dir_data <- as.list(dir_data)
# load data
data_upload <- lapply(dir_data, function(x){
  if(file.exists(x)){
    data <- read.csv(x, header = T, sep = ";")
  }else{
    data <- "file not found"
  }
  return(data)
})
# setup for comparison
diffs <- lapply(as.character(sort(as.numeric(names(data_upload)))), function(x){
  # check that the second dataset exists
  if(as.character(as.numeric(x) + 1) %in% names(data_upload)){
    # first dataset
    print(data_upload[[x]])
    # second dataset
    print(data_upload[[as.character(as.numeric(x) + 1)]])
    # do your operations here
    comparison <- render(diff_data(data_upload[[x]],
                                   data_upload[[as.character(as.numeric(x) + 1)]],
                                   ignore_whitespace = T, count_like_a_spreadsheet = F))
    numbers <- c(x, as.numeric(x) + 1)
    # save both the comparison data and the numbers of the datasets
    return(list(comparison, numbers))
  }
})
# you can find the differences here
diffs
This script loads all csv-files in a folder and its sub-folders and puts them into a list by their numbers. In case there are no doubles, this will work. If you have doubles, you will have to adjust the part where the vector is named so that you can index the full names of the files afterwards.
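As a hedged sketch of that adjustment (not part of the original script), assuming trailing numbers can repeat across folders, you could name each entry by its full file name and keep the numbers in a separate vector for ordering:
# sketch: unique names from full file names, trailing numbers kept apart
dir_data <- unique(unlist(dir_data))
names(dir_data) <- sub("\\.csv$", "", basename(dir_data))
# extract the trailing digits of each name for ordering the pairs
file_nums <- as.numeric(sub(".*[^0-9](\\d+)$", "\\1", names(dir_data)))
dir_data <- as.list(dir_data[order(file_nums)])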
A simple for-loop using paste0 will read in the pairs:
for (i in 1:70) { # assuming the last pair is cars_file70.csv and Lorries_file71.csv
x_original <- read.csv(paste0("cars_file",i,".csv"), strip.white=TRUE, stringsAsFactors = FALSE)
x_changed <- read.csv(paste0("Lorries_file",i+1,".csv"), strip.white=TRUE, stringsAsFactors = FALSE)
render(diff_data(x_original,x_changed ,ignore_whitespace=TRUE,count_like_a_spreadsheet = FALSE))
}
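Since the prefixes differ between files (cars, Lorries, computers, ...), a hedged alternative is to sort by the trailing number instead of hard-coding the names. A sketch, assuming every file ends in _file<N>.csv and daff is already loaded:
# sketch: order files by their trailing number, compare consecutive pairs
files <- list.files(pattern = "_file\\d+\\.csv$")
nums  <- as.numeric(sub(".*_file(\\d+)\\.csv$", "\\1", files))
files <- files[order(nums)]
for (i in seq_len(length(files) - 1)) {
  a <- read.csv(files[i],     strip.white = TRUE, stringsAsFactors = FALSE)
  b <- read.csv(files[i + 1], strip.white = TRUE, stringsAsFactors = FALSE)
  render(diff_data(a, b, ignore_whitespace = TRUE,
                   count_like_a_spreadsheet = FALSE))
}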
For simplicity I used two .csv files:
csv_1: 1,2,4
csv_2: 1,8,10
Load all the .csv files from the folder:
files <- dir("Your folder path", pattern = '\\.csv', full.names = TRUE)
tables <- lapply(files, read.csv)
#create an empty list to store the comparison output
diff <- list()
Loop through all the loaded files and compare:
for (pos in 1:length(tables)) {
  if (pos != length(tables)) { #skip the last one, it has no successor
    #save the comparison output
    diff[[pos]] <- diff_data(as.data.frame(tables[pos]), as.data.frame(tables[pos + 1]),
                             ignore_whitespace = TRUE, count_like_a_spreadsheet = FALSE)
  }
}
The comparison output stored in diff:
[[1]]
Daff Comparison: ‘as.data.frame(tables[pos])’ vs. ‘as.data.frame(tables[pos + 1])’
+++ +++ --- ---
## X1 X8 X10 X2 X4

Get certain values from a list of pdfs

I would like to:
- get certain data on page 2 for every element in a list of pdf files
- data from page 2 (for Bond Futures CGB ... columns 2, 11 and 16)
- create a data frame aggregating all this data:

| Year | Month   | Metric                  |
|------|---------|-------------------------|
| 2013 | January | Monthly Volume          |
| 2013 | January | Month End Open Interest |
| 2013 | January | Transactions            |
I have tried the following but haven't gotten far at all - my apologies.
library(rvest)
library(pdftools)
library(tidyverse)
filepath <- "~/R Working Directory/CanadianFutures"
files <- list.files(path = filepath, pattern = '*.pdf')
The variable files contains the list:
[1] "1301_stats_en.pdf" "1302_stats_en.pdf" "1303_stats_en.pdf" "1304_stats_en.pdf" "1305_stats_en.pdf" "1306_stats_en.pdf"
[7] "1307_stats_en.pdf" "1308_stats_en.pdf" "1309_stats_en.pdf" "1310_stats_en.pdf" "1311_stats_en.pdf" "1312_stats_en.pdf"
[13] "1401_stats_en.pdf" "1402_stats_en.pdf" "1403_stats_en.pdf" "1404_stats_en.pdf" "1405_stats_en.pdf" "1406_stats_en.pdf".....[61] "1801_stats_en.pdf" "1802_stats_en.pdf" "1803_stats_en.pdf" "1804_stats_en.pdf" "1805_stats_en.pdf"
I have tried the following to get page 2 for each pdf but totally lost:
all <- lapply(files, function(x) {
  txt <- pdf_text(filenames)
  page_2 <- txt[2]
})
I get the following:
Error in normalizePath(pdf, mustWork = TRUE) :
path[1]="1301_stats_en.pdf": No such file or directory
All the pdfs in my list have the same consistent formatting.
Here is an example of the pdf https://www.m-x.ca/f_stat_en/1401_stats_en.pdf
Thank you
Make sure your working directory is the same as where you stored your files:
getwd()
Another option is to build your list of files as complete paths:
files <- list.files(filepath, pattern = '*.pdf', full.names = T)
>files
[1] "Downloads/naamloze map//1401_stats_en-2.pdf"
[2] "Downloads/naamloze map//1401_stats_en.pdf"
PDFreader <- function(x){
  txt <- pdf_text(x)
  page_2 <- txt[2] # keep only page 2
  page_2
}
lapply(files, PDFreader)
returns
[[1]]
[1]..... text....
[[2]]
[1]..... text....
Good luck

read multiple text files into r for text mining purposes

I have a batch of text files that I need to read into r to do text mining.
So far, I have tried read.table, readLines, lapply, and mcsv_r from the qdap package, to no avail. I have tried to write a loop to read the files, but I have to specify the name of the file, which changes in every iteration.
Here is what I have tried:
# Relative path points to the local folder
folder.path="../data/InauguralSpeeches/"
# get the list of file names
speeches=list.files(path = folder.path, pattern = "*.txt")
for(i in 1:length(speeches))
{
  text_df <- do.call(rbind, lapply(speeches[i], read.csv))
}
Moreover, I have tried the following:
library(data.table)
files <- list.files(path = folder.path,pattern = ".csv")
temp <- lapply(files, fread, sep=",")
data <- rbindlist( temp )
And it is giving me this error when inaugAbrahamLincoln-1.csv clearly exists in the folder:
files <- list.files(path = folder.path,pattern = ".csv")
> temp <- lapply(files, fread, sep=",")
Error in FUN(X[[i]], ...) :
File 'inaugAbrahamLincoln-1.csv' does not exist. Include one or more spaces to consider the input a system command.
> data <- rbindlist( temp )
Error in rbindlist(temp) : object 'temp' not found
>
But it only works on .csv files, not on .txt files.
Is there a simpler way to do text mining from multiple sources files? If so how?
Thanks
I often have this same problem. The textreadr package that I maintain is designed to make reading .csv, .pdf, .doc, and .docx documents and directories of these documents easy. It would reduce what you're doing to:
textreadr::read_dir("../data/InauguralSpeeches/")
Your example is not reproducible so I do it below (please make your example reproducible in the future).
library(textreadr)
## Minimal working example
dir.create('delete_me')
file.copy(dir(system.file("docs/Maas2011/pos", package = "textreadr"), full.names=TRUE), 'delete_me', recursive=TRUE)
write.csv(mtcars, 'delete_me/mtcars.csv')
write.csv(CO2, 'delete_me/CO2.csv')
cat('test\n\ntesting\n\ntester', file='delete_me/00_00.txt')
## the read in of a directory
read_dir('delete_me')
output
The output below shows the tibble output with each document registered in the document column. For every line in the document there is one row for that document. Depending on what's in the csv files, this may not be fine-grained enough; a sketch that collapses each document to a single row follows the output below.
## document content
## 1 0_9 Bromwell High is a cartoon comedy. It ra
## 2 00_00 test
## 3 00_00
## 4 00_00 testing
## 5 00_00
## 6 00_00 tester
## 7 1_7 If you like adult comedy cartoons, like
## 8 10_9 I'm a male, not given to women's movies,
## 9 11_9 Liked Stanley & Iris very much. Acting w
## 10 12_9 Liked Stanley & Iris very much. Acting w
## .. ... ...
## 141 mtcars "Ferrari Dino",19.7,6,145,175,3.62,2.77,
## 142 mtcars "Maserati Bora",15,8,301,335,3.54,3.57,1
## 143 mtcars "Volvo 142E",21.4,4,121,109,4.11,2.78,18
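If one row per document is needed rather than one row per line, a minimal sketch collapses the content column with aggregate(), assuming the document/content column names shown above:
# sketch: collapse all lines of each document into a single row
dat <- textreadr::read_dir("delete_me")
one_per_doc <- aggregate(content ~ document, data = dat,
                         FUN = paste, collapse = " ")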
Here is code that will read all the *.csv files in a directory to a single data.frame:
dir <- '~/Desktop/testcsv/'
files <- list.files(dir,pattern = '*.csv', full.names = TRUE)
data <- lapply(files, read.csv)
df <- do.call(rbind, data)
Notice that I added the argument full.names = TRUE. This returns absolute paths; without them, R looks for "inaugAbrahamLincoln-1.csv" in the current working directory, which is why you're getting an error even though the file exists.
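The same pattern adapts to the .txt files the question is actually about. A sketch, assuming one document per file:
# sketch: one row per .txt file, file name in one column, text in the other
txts <- list.files(dir, pattern = "\\.txt$", full.names = TRUE)
docs <- data.frame(document = basename(txts),
                   content  = vapply(txts, function(f)
                                 paste(readLines(f), collapse = " "),
                                 character(1)),
                   row.names = NULL,
                   stringsAsFactors = FALSE)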
Here is one way to do it.
library(data.table)
setwd("C:/Users/Excel/Desktop/CSV Files/")
WD="C:/Users/Excel/Desktop/CSV Files/"
# read headers
data<-data.table(read.csv(text="CashFlow,Cusip,Period"))
csv.list<- list.files(WD)
k=1
for (i in csv.list){
temp.data<-read.csv(i)
data<-data.table(rbind(data,temp.data))
if (k %% 100 == 0)
print(k/length(csv.list))
k<-k+1
}
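A leaner variant of the same idea, assuming all files share the same columns, avoids growing the table inside the loop by binding everything in one call:
# sketch: read every csv with fread() and stack them at once
library(data.table)
data <- rbindlist(lapply(csv.list, fread))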

Remove columns from the data stored in the "variable"

So I loaded few csv files:
tbl = list.files(pattern="*.csv")
> tbl
[1] "F1.csv" "F10.csv" "F11.csv" "F12.csv" "F13.csv" "F14.csv" "F15.csv" "F16.csv"
[9] "F17.csv" "F18.csv" "F19.csv" "F2.csv" "F20.csv" "F3.csv" "F4.csv" "F5.csv"
[17] "F6.csv" "F7.csv" "F8.csv" "F9.csv"
And now I would like to delete two columns from the F6.csv file. Those columns are 7 and 8.
How to delete those columns from:
tbl[17]
Can I access this data directly, or do I have to load this table separately and remove those columns?
Per the note above, you have to read these files into R. Then you can manipulate them once they're in your local environment.
# read all of the files into R
for(i in list.files(pattern = '*.csv')){
  name <- paste(i)
  dat <- read.csv(i, header = T)
  assign(name, dat)
}
# remove columns 7 & 8 from F6.csv
F6.csv <- F6.csv[, -c(7, 8)]
# if you want to write this new data set to F6.csv
# write.csv(F6.csv, 'F6.csv', row.names = F)
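Alternatively, a minimal sketch keeps everything in a named list instead of using assign(), so each table stays addressable by its file name:
# sketch: read all files into a named list, then drop the columns
tbls <- lapply(tbl, read.csv)
names(tbls) <- tbl
tbls[["F6.csv"]] <- tbls[["F6.csv"]][, -c(7, 8)]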
