Add filename column to dataframe from multiple json imports - r

I've got multiple .json files whose names are dates. I would like to import all the .json files into R to create one data frame and add a column that contains the file name each row came from.
2020-06-15.json:
[{"title":"Moral Machine","title_link":"http://moralmachine.mit.edu/"}]
2020-06-16.json:
[{"title":"De Monitor","title_link":"http://demonitor.ncrv.nl/"}]
Then I create a dataframe
# Directory that holds the dated .json files
test_path <- "data"
test_files <- list.files(test_path, pattern = "*.json")
# Parse every file and row-bind the results into a single data frame
map_df(
  test_files,
  function(json_name) fromJSON(file.path(test_path, json_name), flatten = TRUE)
)
Desired output:
title title_link file_name
1 Moral Machine http://moralmachine.mit.edu/ 2020-06-15.json
2 De Monitor http://demonitor.ncrv.nl/ 2020-06-16.json

Using rbindlist from data.table:
library(data.table)

# Bare names of the JSON files under `test_path`; they double as the row labels.
# The pattern is anchored so only names ending in ".json" match.
file_names <- list.files(path = test_path, pattern = "\\.json$")

# Read each file and tag its rows with the file it came from.
data_list <- lapply(file_names, function(z) {
  # NOTE(review): z is a bare file name, so myFunction has to resolve it
  # against test_path itself -- confirm.
  dat <- myFunction(z) # your function to read and clean json files
  dat$file_name <- z
  dat
})

# Stack the per-file tables, matching columns by name; fill = TRUE pads
# columns that are missing from some files with NA.
combined_data <- rbindlist(l = data_list, use.names = TRUE, fill = TRUE)
Since I don't know the structure of your JSON file, I'm assuming you have a function named myFunction to read and clean up the data.

library(jsonlite)

# Full paths are needed to open the files; the bare names are the labels.
# Deriving the names from the paths keeps both vectors in the same order.
test_files_full <- list.files(test_path, pattern = "\\.json$", full.names = TRUE)
test_files <- basename(test_files_full)

# For single-record files like the examples: one row per file, holding the
# parsed JSON fields followed by the file name.
t(sapply(seq_along(test_files), function(x) {
  c(fromJSON(test_files_full[x]), file_name = test_files[x])
}))
gives,
title title_link file_name
[1,] "Moral Machine" "http://moralmachine.mit.edu/" "2020-06-15.json"
[2,] "De Monitor" "http://demonitor.ncrv.nl/" "2020-06-16.json"

Related

Converting text files to excel files in R

I have radiotelemetry data that is downloaded as a series of text files. I was provided with code in 2018 that looped through all the text files and converted them into CSV files. Up until 2021 this code worked. However, now the below code (specifically the lapply loop), returns the following error:
"Error in setnames(x, value) :
Can't assign 1 names to a 4 column data.table"
# set the working directory to the folder that contains this script, must run in RStudio
setwd(dirname(rstudioapi::callFun("getActiveDocumentContext")$path))
# get the path to the master data folder (<working directory>/data)
path_to_data <- paste(getwd(), "data", sep = "/", collapse = NULL)
# list the .TXT files under the data folder, searching sub-folders too
files <- list.files(path=path_to_data, pattern="*.TXT", full.names=TRUE, recursive=TRUE)
# regular expression of the record we want: a d/d/d date, a d:d:d time,
# then four whitespace-separated groups of digits
regex <- "^\\d*\\/\\d*\\/\\d*\\s*\\d*:\\d*:\\d*\\s*\\d*\\s*\\d*\\s*\\d*\\s*\\d*"
# vector of column names, no whitespace
columns <- c("Date", "Time", "Channel", "TagID", "Antenna", "Power")
# loop through all .TXT files, extract valid records and save to .csv files
# NOTE(review): the function argument is named `x`, but the body reads `file`;
# no local variable called `file` is defined in this version.
lapply(files, function(x){
df <- read_table(file) # read the .TXT file to a DataFrame
dt <- data.table(df) # convert the dataframe to a more efficient data structure
# NOTE(review): a single name ("columns") is assigned here, while the lines
# below refer to a column called `col`. The reported error ("Can't assign 1
# names to a 4 column data.table") says dt has 4 columns at this point.
colnames(dt) <- c("columns") # modify the column name
valid <- dt %>% filter(str_detect(col, regex)) # filter based on regular expression
valid <- separate(valid, col, into = columns, sep = "\\s+") # split into columns
towner_name <- str_sub(basename(file), start = 1 , end = 2) # extract tower name (first two characters of the file name)
valid$Tower <- rep(towner_name, nrow(valid)) # add Tower column
# output path: same folder and base name as the input, with the last four characters replaced by ".csv"
file_path <- file.path(dirname(file), paste(str_sub(basename(file), end = -5), ".csv", sep=""))
write.csv(valid, file = file_path, row.names = FALSE, quote = FALSE) # save to .csv
})
I looked up possible fixes for this and found using "setnames(skip_absent=TRUE)" in the loop resolved the setnames error but instead gave the error "Error in is.data.frame(x) : argument "x" is missing, with no default"
# For every .TXT file: read it, keep the rows matching `regex`, split them
# into the `columns` fields, add a Tower column and write a .csv next to it.
lapply(files, function(file){
df <- read_table(file) # read the .TXT file to a DataFrame
dt <- data.table(df) # convert the dataframe to a more efficient data structure
# NOTE(review): setnames() is called here without a table or any names, which
# matches the reported error 'argument "x" is missing, with no default'.
setnames(skip_absent = TRUE)
# NOTE(review): only one name is supplied; this assumes dt has exactly one
# column -- confirm against what read_table() returns for these files.
colnames(dt) <- c("col") # modify the column name
valid <- dt %>% filter(str_detect(col, regex)) # filter based on regular expression
valid <- separate(valid, col, into = columns, sep = "\\s+") # split into columns
towner_name <- str_sub(basename(file), start = 1 , end = 2) # extract tower name (first two characters of the file name)
valid$Tower <- rep(towner_name, nrow(valid)) # add Tower column
# output path: same folder and base name as the input, with the last four characters replaced by ".csv"
file_path <- file.path(dirname(file), paste(str_sub(basename(file), end = -5), ".csv", sep=""))
write.csv(valid, file = file_path, row.names = FALSE, quote = FALSE) # save to .csv
})
I'm confused as to why this code is no longer working despite working fine last year. Any help would be greatly appreciated!
The error occurred at the line colnames(dt) <- c("columns"), where you provided only one name for what is (apparently) a 4-column table. If you meant to rename one particular column, you can use
colnames(dt)[i] <- c("columns")
where i is the index of the column you are renaming. Alternatively, provide a vector with 4 new names.

Read Multiple txt files in an order and combine them into one dataframe but label the origin of each row in the new generated dataframe in r

I have 6 txt files and I want to combine them into 1 dataframe. I know how to read them simultaneously and combine them in default way.
I learned to do this in this website:
# Full paths, so the files can be read whatever the working directory is;
# the pattern is a regular expression matching names that end in ".txt"
txt_files_ls <- list.files(path = mypath, pattern = "\\.txt$", full.names = TRUE)
# Read each tab-separated file (first line is the header) into its own data frame
txt_files_df <- lapply(txt_files_ls, function(x) {
  read.table(file = x, header = TRUE, sep = "\t")
})
# Combine them
combined_df <- do.call("rbind", txt_files_df)
What I want now is to have read.table read the txt files in an order that I define, so that after combining them I can label each row with the name of the txt file it came from. Thank you.
You can try this:
# Full paths, so read.data() can open the files from any working directory;
# the pattern is a regular expression matching names that end in ".txt"
txt_files_ls <- list.files(path = mypath, pattern = "\\.txt$", full.names = TRUE)
# Read one tab-separated text file (with a header row) and record which file
# its rows came from.
#   x: path to the file to read
# Returns the file's contents as a data frame with an extra column `var`
# holding the file name (directory part stripped).
read.data <- function(x) {
  y <- read.table(file = x, header = TRUE, sep = "\t")
  # rep() to nrow(y) so a file with a header but no data rows still works
  y$var <- rep(basename(x), nrow(y))
  y
}
# Read every listed file with read.data()
txt_files_df <- lapply(X = txt_files_ls, FUN = read.data)
# Stack the per-file data frames into a single one
combined_df <- do.call(rbind, txt_files_df)
Where var contains the name of each file.

Import multiple CSV files and use file name as column [duplicate]

This question already has answers here:
Importing multiple .csv files into R and adding a new column with file name
(2 answers)
Closed 14 days ago.
I have numerous csv files in multiple directories that I want to read into an R tibble or data.table. I use "list.files()" with the recursive argument set to TRUE to create a list of file names and paths, then use "lapply()" to read in multiple csv files, and then "bind_rows()" to stick them all together:
# Every file under `path` (searched recursively) whose name matches `fileptrn`
filenames <- list.files(
  path,
  pattern = fileptrn,
  recursive = TRUE,
  full.names = TRUE
)
# Read each csv, then stack the resulting tables row-wise
tbl <- bind_rows(lapply(filenames, read_csv))
This approach works fine. However, I need to extract a substring from each file name and add it as a column to the final table. I can get the substring I need with "str_extract()" like this:
# One match per file: two capital letters, a hyphen, then three alphanumerics
sites <- str_extract(string = filenames, pattern = "[A-Z]{2}-[A-Za-z0-9]{3}")
I am stuck however on how to add the extracted substring as a column as lapply() runs through read_csv() for each file.
I generally use the following approach, based on dplyr/tidyr:
# One row per file: pull the site code out of the path, read each file into a
# list-column, then expand that column so every data row carries its Site.
# `filenames` is the vector of paths built with list.files() in the question.
data <- tibble(File = filenames) %>%
  extract(File, "Site", "([A-Z]{2}-[A-Za-z0-9]{3})", remove = FALSE) %>%
  mutate(Data = lapply(File, read_csv)) %>%
  unnest(Data) %>%
  select(-File)
tidyverse approach:
Update:
readr 2.0 (and beyond) now has built-in support for reading a list of files with the same columns into one output table in a single command. Just pass the filenames to be read in the same vector to the reading function. For example reading in csv files:
# All csv files in the directory; print the vector to check what was found
files <- fs::dir_ls("D:/data", glob = "*.csv")
print(files)
# Read them in one call; `id = "path"` stores each row's source file in `path`
dat <- read_csv(file = files, id = "path")
Alternatively using map_dfr with purrr:
Add the filename using the .id = "source" argument in purrr::map_dfr()
An example loading .csv files:
# specify the directory, then list the csv files in it
# (the dot is escaped so only names ending in ".csv" match)
data_dir <- here("file/path")
data_list <- fs::dir_ls(data_dir, regexp = "\\.csv$")
# return a single data frame w/ purrr::map_dfr; `.id = "source"` stores the
# name of each list element (here the file path) in a "source" column
my_data <- data_list %>%
  purrr::map_dfr(read_csv, .id = "source")
# Alternatively, reduce source from the file path to just the file name
my_data <- data_list %>%
  purrr::map_dfr(read_csv, .id = "source") %>%
  dplyr::mutate(source = basename(source))
You could use purrr::map2 here, which works similarly to mapply
# Load the packages before their functions are used
library(purrr)
library(dplyr)
library(readr)
library(stringr) # provides str_extract()

filenames <- list.files(path, full.names = TRUE, pattern = fileptrn, recursive = TRUE)
sites <- str_extract(filenames, "[A-Z]{2}-[A-Za-z0-9]{3}") # same length as filenames
stopifnot(length(filenames) == length(sites)) # errors if the lengths differ
# .x is an element of filenames, .y the matching element of sites
ans <- map2(filenames, sites, ~ read_csv(.x) %>% mutate(id = .y))
The output of map2 is a list, similar to lapply
If you have a development version of purrr, you can use imap, which is a wrapper for map2 with an index
data.table approach:
If you name the list, then you can use this name to add to the data.table when binding the list together.
workflow
files <- list.files( whatever... )
# read the files from the list
l <- lapply(files, fread)
# name the list elements after the files they were read from;
# this is also the step to manipulate the file names to whatever you like
names(l) <- basename(files)
# bind the rows from the list together, putting the list names (the file
# names) into the column "id"
dt <- rbindlist(l, idcol = "id")
You just need to write your own function that reads the csv and adds the column you want, before combining them.
# Read one csv and prepend a Site column parsed from its path.
#   x: path to the csv file
my_read_csv <- function(x) {
  site_code <- str_extract(x, "[A-Z]{2}-[A-Za-z0-9]{3}")
  csv_data <- read_csv(x)
  cbind(Site = site_code, csv_data)
}
filenames <- list.files(path, pattern = fileptrn, recursive = TRUE, full.names = TRUE)
# Read every file with the helper, then stack the results row-wise
tbl <- bind_rows(lapply(filenames, my_read_csv))
You can build a filenames vector based on "sites" with the exact same length as tbl and then combine the two using cbind
### Get file names
filenames <- list.files(path, full.names = TRUE, pattern = fileptrn, recursive = TRUE)
sites <- str_extract(filenames, "[A-Z]{2}-[A-Za-z0-9]{3}")
### Read every csv once and reuse the result below
csv_list <- lapply(filenames, read_csv)
### Get length (number of rows) of each csv
file_lengths <- vapply(csv_list, nrow, integer(1))
### Repeat each site once per row of its file
file_names <- rep(sites, file_lengths)
### Create table
tbl <- bind_rows(csv_list)
### Combine file_names and tbl
tbl <- cbind(tbl, filename = file_names)

R: Merging data frames based on file name

My files (example as I have hundreds of these files):
France.csv
France_variables.csv
Germany.csv
Germany_variables.csv
Spain.csv
Spain_variables.csv
Portugal.csv
Portugal_variables.csv
I want to merge France with France_variables, Germany with Germany_variables etc. I know I can use rbind with the two files but I want to do this as a loop because I have lots of these to merge. I'm not sure how to do a string search and then rbind in a loop or if there is a better way of doing this.
I am new to R so any help would be much appreciated.
You can use something like this:
library(tidyverse)
# Get unique countries: strip the extension and any "_suffix" from each csv name
country <- unique(gsub("\\..*$|_.*", "", list.files(path = ".", pattern = "\\.csv$")))
# Loop
for (i in country) {
  # Anchor the match so a country only picks up "<country>.csv" and
  # "<country>_*.csv", not other files that merely contain its name
  country_files <- list.files(path = ".", pattern = paste0("^", i, "(_.*)?\\.csv$"))
  dat <- country_files %>% map(read_csv) %>% reduce(rbind)
  # Store the merged table under the name df_<country>
  assign(paste("df", i, sep = "_"), dat)
  rm(dat)
}
This will create dataframes like df_France, df_Germany , etc.
Play with the 'grepl', and see if you can get this to work......
# set the working directory (where files are saved)
setwd("C:/your_path_here/")
# keep only the files whose name ends in ".TXT"
file_names <- list.files(getwd(), pattern = "\\.TXT$")
# print file_names vector
file_names
# see the data structure
str(file_names)
# run read.csv on all values of file_names (the files have no header row)
files <- lapply(file_names, read.csv, header = FALSE, stringsAsFactors = FALSE)
# stack the per-file data frames into one
files <- do.call(rbind, files)
str(files)
# set column names
names(files) <- c("col1", "col2", "col3", "col4", "col5")
str(files)
# finally... write the combined table as tab-separated text and as csv
write.table(files, "C:/your_path/mydata.txt", sep = "\t")
write.csv(files, "C:/your_path/mydata.csv")
http://www.rforexcelusers.com/combine-delimited-files-r/

Read in multiple files and append file name to data frame [duplicate]

This question already has answers here:
Importing multiple .csv files into R and adding a new column with file name
(2 answers)
Closed 15 days ago.
I have numerous csv files in multiple directories that I want to read into an R tibble or data.table. I use "list.files()" with the recursive argument set to TRUE to create a list of file names and paths, then use "lapply()" to read in multiple csv files, and then "bind_rows()" to stick them all together:
# Every file under `path` (searched recursively) whose name matches `fileptrn`
filenames <- list.files(
  path,
  pattern = fileptrn,
  recursive = TRUE,
  full.names = TRUE
)
# Read each csv, then stack the resulting tables row-wise
tbl <- bind_rows(lapply(filenames, read_csv))
This approach works fine. However, I need to extract a substring from each file name and add it as a column to the final table. I can get the substring I need with "str_extract()" like this:
# One match per file: two capital letters, a hyphen, then three alphanumerics
sites <- str_extract(string = filenames, pattern = "[A-Z]{2}-[A-Za-z0-9]{3}")
I am stuck however on how to add the extracted substring as a column as lapply() runs through read_csv() for each file.
I generally use the following approach, based on dplyr/tidyr:
# One row per file: pull the site code out of the path, read each file into a
# list-column, then expand that column so every data row carries its Site.
# `filenames` is the vector of paths built with list.files() in the question.
data <- tibble(File = filenames) %>%
  extract(File, "Site", "([A-Z]{2}-[A-Za-z0-9]{3})", remove = FALSE) %>%
  mutate(Data = lapply(File, read_csv)) %>%
  unnest(Data) %>%
  select(-File)
tidyverse approach:
Update:
readr 2.0 (and beyond) now has built-in support for reading a list of files with the same columns into one output table in a single command. Just pass the filenames to be read in the same vector to the reading function. For example reading in csv files:
# All csv files in the directory; print the vector to check what was found
files <- fs::dir_ls("D:/data", glob = "*.csv")
print(files)
# Read them in one call; `id = "path"` stores each row's source file in `path`
dat <- read_csv(file = files, id = "path")
Alternatively using map_dfr with purrr:
Add the filename using the .id = "source" argument in purrr::map_dfr()
An example loading .csv files:
# specify the directory, then list the csv files in it
# (the dot is escaped so only names ending in ".csv" match)
data_dir <- here("file/path")
data_list <- fs::dir_ls(data_dir, regexp = "\\.csv$")
# return a single data frame w/ purrr::map_dfr; `.id = "source"` stores the
# name of each list element (here the file path) in a "source" column
my_data <- data_list %>%
  purrr::map_dfr(read_csv, .id = "source")
# Alternatively, reduce source from the file path to just the file name
my_data <- data_list %>%
  purrr::map_dfr(read_csv, .id = "source") %>%
  dplyr::mutate(source = basename(source))
You could use purrr::map2 here, which works similarly to mapply
# Load the packages before their functions are used
library(purrr)
library(dplyr)
library(readr)
library(stringr) # provides str_extract()

filenames <- list.files(path, full.names = TRUE, pattern = fileptrn, recursive = TRUE)
sites <- str_extract(filenames, "[A-Z]{2}-[A-Za-z0-9]{3}") # same length as filenames
stopifnot(length(filenames) == length(sites)) # errors if the lengths differ
# .x is an element of filenames, .y the matching element of sites
ans <- map2(filenames, sites, ~ read_csv(.x) %>% mutate(id = .y))
The output of map2 is a list, similar to lapply
If you have a development version of purrr, you can use imap, which is a wrapper for map2 with an index
data.table approach:
If you name the list, then you can use this name to add to the data.table when binding the list together.
workflow
files <- list.files( whatever... )
# read the files from the list
l <- lapply(files, fread)
# name the list elements after the files they were read from;
# this is also the step to manipulate the file names to whatever you like
names(l) <- basename(files)
# bind the rows from the list together, putting the list names (the file
# names) into the column "id"
dt <- rbindlist(l, idcol = "id")
You just need to write your own function that reads the csv and adds the column you want, before combining them.
# Read one csv and prepend a Site column parsed from its path.
#   x: path to the csv file
my_read_csv <- function(x) {
  site_code <- str_extract(x, "[A-Z]{2}-[A-Za-z0-9]{3}")
  csv_data <- read_csv(x)
  cbind(Site = site_code, csv_data)
}
filenames <- list.files(path, pattern = fileptrn, recursive = TRUE, full.names = TRUE)
# Read every file with the helper, then stack the results row-wise
tbl <- bind_rows(lapply(filenames, my_read_csv))
You can build a filenames vector based on "sites" with the exact same length as tbl and then combine the two using cbind
### Get file names
filenames <- list.files(path, full.names = TRUE, pattern = fileptrn, recursive = TRUE)
sites <- str_extract(filenames, "[A-Z]{2}-[A-Za-z0-9]{3}")
### Read every csv once and reuse the result below
csv_list <- lapply(filenames, read_csv)
### Get length (number of rows) of each csv
file_lengths <- vapply(csv_list, nrow, integer(1))
### Repeat each site once per row of its file
file_names <- rep(sites, file_lengths)
### Create table
tbl <- bind_rows(csv_list)
### Combine file_names and tbl
tbl <- cbind(tbl, filename = file_names)

Resources