I'm encountering issue using the below script. All are working fine except for the final line which results to the error below.
# read dbf
library(foreign)
setwd("C:/Users/JGGliban/Desktop/Work/ADMIN/Other Stream/PH")
# Combine multiple dbf files
# library('tidyverse')
# List all files ending with dbf in directory
dbf_files <- list.files(pattern = c("*.DBF","*.dbf"), full.names = TRUE)
# Read each dbf file into a list
dbf_list <- lapply(dbf_files, read.dbf, as.is = FALSE)
# Concatenate the data in each dbf file into one combined data frame
data <- do.call(rbind, dbf_list)
# Write dbf file - max-nchar is the maimum number of characters allowed in a character field. After the max, it will be truncated.
x <- write.dbf(data, file, factor2char = TRUE, max_nchar = 254)
Code modified to:
x <- write.dbf(data, "file.dbf", factor2char = TRUE, max_nchar = 254)
Related
I am using this code in R to change the name of the fourth column of an excel file (thanks to this: Change column name with file name of corresponding excel file), however the problem is that at the end the file is overwritten and it generates a corrupted excel file (Excel file format and extension don’t match). How it is possible to create a new excel file a the end (not overwritten) or do you have any other solution to not create a corrupted file?
filenames <- list.files(pattern = '\\.xlsx', full.names = TRUE)
lapply(filenames, function(x) {
#Read the data
data <- readxl::read_excel(x)
#Change the 4th column with filename
names(data)[4] <- tools::file_path_sans_ext(basename(x))
#Write the data back
writexl::write_xlsx(data, x)
})
You can use tools::file_path_sans_ext and tools:.file_ext to put together a new output filename. In the code below, I append the suffix _new to the file name before the file extension.
filenames <- list.files(pattern = '\\.xlsx', full.names = TRUE)
lapply(filenames, function(x) {
#Read the data
data <- readxl::read_excel(x)
#Change the 4th column with filename
y <- tools::file_path_sans_ext(basename(x))
names(data)[4] <- y
#Write the data back
ext <- tools::file_ext(x)
y <- paste0(y, "_new.", ext)
writexl::write_xlsx(data, y)
})
I am using RStudio (running R 4.0.1) and Stata 12 for Windows and have got a large number of folders with Stata 16 .dta files (and other types of files not relevant to this question). I want to create an automated process of converting all Stata 16 .dta files into Stata 12 format (keeping all labels) to then analyze.
Ideally, I want to keep the names of the original folders and files but save the converted versions into a new location.
This is what I have got so far:
setwd("C:/FilesLocation")
#vector with name of files to be converted
all_files <- list.files(pattern="*.dta",full.names = TRUE)
for (i in all_files){
#Load file to be converted into STATA12 version
data <- read_dta("filename.dta",
encoding = NULL,
col_select = NULL,
skip = 0,
n_max = Inf,
.name_repair = "unique")
#Write as .dta
write_dta(data,"c:/directory/filename.dta", version = 12, label = attr(data, "label"))
}
Not sure this is the best approach. I know the commands inside the loop are working for a single file but not really being able to automate for all files.
Your code only needs some very minor modifications. I've indicated the changes (along with comments explaining them) in the snippet below.
library(haven)
mypath <- "C:/FilesLocation"
all_files <- list.files(path = mypath, pattern = "*.dta", full.names = TRUE)
for (i in 1:length(all_files)){
#(Above) iterations need the length of the vector to be specified
#Load file to be converted into STATA12 version
data <- read_dta(all_files[i], #You want to read the ith element in all_files
encoding = NULL,
col_select = NULL,
skip = 0,
n_max = Inf,
.name_repair = "unique")
#Add a _v12 to the filename to
#specify that is is version 12 now
new_fname <- paste0(unlist(strsplit(basename(all_files[i]), "\\."))[1],
"_v12.", unlist(strsplit(basename(all_files[i]), "\\."))[2])
#Write as .dta
#with this new filename
write_dta(data, path = paste0(mypath, "/", new_fname),
version = 12, label = attr(data, "label"))
}
I tried this out with some .sta files from here, and the script ran without throwing up errors. I haven't tested this on Windows but in theory it should work fine.
Edit: here is a more complete solution with read_dta and write_dta wrapped into a single function dtavconv. This function also allows the user to convert version numbers to arbitrary values (default is 12).
#----
#.dta file version conversion function
dtavconv <- function(mypath = NULL, myfile = NULL, myver = 12){
#Function to convert .dta file versions
#Default version files are converted to is v12
#Default directory is whatever is specified by getwd()
if(is.null(mypath)) mypath <- getwd()
#Main code block wrapped in a tryCatch()
myres <- tryCatch(
{
#Load file to be converted into STATA12 version
data <- haven::read_dta(paste0(mypath, "/", myfile),
encoding = NULL,
col_select = NULL,
skip = 0,
n_max = Inf,
.name_repair = "unique")
#Add a _v12 to the filename to
#specify that is is version 12 now
new_fname <- paste0(unlist(strsplit(basename(myfile), "\\."))[1],
"_v", myver, ".", unlist(strsplit(basename(myfile), "\\."))[2])
#Write as .dta
#with this new filename
haven::write_dta(data, path = paste0(mypath, "/", new_fname),
version = myver, label = attr(data, "label"))
message("\nSuccessfully converted ", myfile, " to ", new_fname, "\n")
},
error = function(cond){
#message("Unable to write file", myfile, " as ", new_fname)
message("\n", cond, "\n")
return(NA)
}
)
return(myres)
}
#----
The function can then be run on as many files as desired by invoking it via lapply or a for loop, as the example below illustrates:
#----
#Example run
library(haven)
#Set your path here below
mypath <- paste0(getwd(), "/", "dta")
#Check to see if this directory exists
#if not, create it
if(!dir.exists(mypath)) dir.create(mypath)
list.files(mypath)
# character(0)
#----
#Downloading some valid example files
myurl <- c("http://www.principlesofeconometrics.com/stata/airline.dta",
"http://www.principlesofeconometrics.com/stata/cola.dta")
lapply(myurl, function(x){ download.file (url = x, destfile = paste0(mypath, "/", basename(x)))})
#Also creating a negative test case
file.create(paste0(mypath, "/", "anegcase.dta"))
list.files(mypath)
# [1] "airline.dta" "anegcase.dta" "cola.dta"
#----
#Getting list of files in the directory
all_files <- list.files(path = mypath, pattern = "*.dta")
#Converting files using dtavconv via lapply
res <- lapply(all_files, dtavconv, mypath = mypath)
#
# Successfully converted airline.dta to airline_v12.dta
#
#
# Error in df_parse_dta_file(spec, encoding, cols_skip, n_max, skip,
# name_repair = .name_repair): Failed to parse /my/path/
# /dta/anegcase.dta: Unable to read from file.
#
#
#
# Successfully converted cola.dta to cola_v12.dta
#
list.files(mypath)
# [1] "airline_v12.dta" "airline.dta" "anegcase.dta" "cola_v12.dta"
# "cola.dta"
#Example for converting to version 14
res <- lapply(all_files, dtavconv, mypath = mypath, myver = 14)
#
# Successfully converted airline.dta to airline_v14.dta
#
#
# Error in df_parse_dta_file(spec, encoding, cols_skip, n_max, skip,
# name_repair = .name_repair): Failed to parse /my/path
# /dta/anegcase.dta: Unable to read from file.
#
#
#
# Successfully converted cola.dta to cola_v14.dta
#
list.files(mypath)
# [1] "airline_v12.dta" "airline_v14.dta" "airline.dta" "anegcase.dta"
# "cola_v12.dta" "cola_v14.dta" "cola.dta"
#----
I am new to R and I want to batch process all files in a working directory.
I have lots of .txt files and want to read them in, calculate a frequency of one Column, calculate percentage and a so called "H-Score", calculate the sum of the H-Score and store it in a vector. Then the next .txt file should be processed and so on.
After all files are processed, I want to write the vector in another .txt file as a result. The final .txt file should also contain the name of the input file and the calculated sum of H-Score. This is what I have so far, but as you can see, I am a absolute Newbie to programming and R...
setwd("~/Desktop/Automated Analysis/TXT/") # Set working directory
# List all txt files including sub-folders
list_of_files <- list.files(path = ".", recursive = TRUE,
pattern = "\\.txt$", full.names = TRUE)
library(data.table)
# Read all the files and create a FileName column to store filenames
DT <- rbindlist( sapply(list_of_files, fread, simplify = FALSE),
use.names = TRUE, idcol = "FileName" )
br = c(0,1,3,9,15,500) # Set breaks
bins = c(0,1,2,3,4) # Set bins
for (k in 1:length(list_of_files)) { # process all the files in the working directory
HScore_list = c() # create a vector for storing the results
for(i in 1:5) { my_vector = c(HScore_list,i) }
freq = hist(Count, breaks=br, plot=FALSE)
df = data.frame(bins, frequency=freq$counts,
df$percent=df$frequency / sum(df$frequency) * 100,
df$HScore=df$percent * df$bins)
HScore = sum(df$HScore)
}
write(HScore_list, "HScore_list.txt", sep="\n")
Do you know what I want and can help me?
EDIT: My Problem is, that the Code is producing no output.
Some background for my question: This is an R script that a previous research assistant wrote, but he did not provide any guidance to me on using it for myself. After working through an R textbook, I attempted to use the code on my data files.
What this code is supposed to do is load multiple .csv files, delete certain items/columns from them, and then write the new cleaned .csv files to a specified directory.
Currently, the files are being created in the right directory with the right file name, but the .csv files that are being created are empty.
I am currently getting the following error message:
Warning in
fread(input = paste0("data/", str_match(pattern = "CAFAS|PECFAS",: Starting data input on line 2 and discarding line 1 because it has too few or too many items to be column names or data: (variable names).
This is my code:
library(data.table)
library(magrittr)
library(stringr)
# create a function to delete unnecessary variables from a CAFAS or PECFAS
data set and save the reduced copy
del.items <- function(file){
data <- fread(input = paste0("data/", str_match(pattern = "CAFAS|PECFAS",
string = file) %>% tolower, "/raw/", file), sep = ",", header = TRUE,
na.strings = "", stringsAsFactors = FALSE, skip = 0, colClasses =
"character", data.table = FALSE)
data <- data[-grep(pattern = "^(CA|PEC)FAS_E[0-9]+(TR?(Initial|[0-
9]+|Exit)|SP[a-z])_(G|S|Item)[0-9]+$", x = names(data))]
write.csv(data, file = paste0("data/", str_match(pattern = "CAFAS|PECFAS",
string = file) %>% tolower, "/items-del/", sub(pattern = "ExportData_", x =
file, replacement = "")) %>% tolower, row.names = FALSE)
}
# delete items from all cafas data sets
cafas.files <- list.files("data/cafas/raw", pattern = ".csv")
for (file in cafas.files){
del.items(file)
}
# delete items from all pecfas data sets
pecfas.files <- list.files("data/pecfas/raw", pattern = ".csv")
for (file in pecfas.files){
del.items(file)
}
I would like to read multiple text files from my directory the files are arranged in following format
regional_vol_GM_atlas1.txt
regional_vol_GM_atlas2.txt
........
regional_vol_GM_atlas152.txt
Data from the files looks in following format
667869 667869
580083 580083
316133 316133
3631 3631
following is the script that i have written
library(readr)
library(stringr)
library(data.table)
array <- c()
for (file in dir(/media/dev/Daten/Task1/subject1/t1)) # path to the directory where .txt files are located
{
row4 <- read.table(file=list.files(pattern ="regional_vol*.txt"),
header = FALSE,
row.names = NULL,
skip = 3, # Skip the 1st 3 rows
nrows = 1, # Read only the next row after skipping the 1st 3 rows
sep = "\t") # change the separator if it is not "\t"
array <- cbind(array, row4)
}
I am incurring following error
Error in file(file, "rt") : invalid 'description' argument
kindly suggest me where i was wrong in the script
This seems to work fine for me. Make changes as per code comments in case files have headers :
[Answer Edited to reflect new information posted by OP]
# rm(list=ls()) #clean memory if you can afford to
mydir<- "~/Desktop/a" #change as per your path
# read full paths
myfiles<- list.files(mydir,pattern = "regional_vol*",full.names=T)
myfiles #check that files listed correctly
# initialise the dataframe from first file
# change header =T/F depending on presence of header
# make sure sep is correct
df<- read.csv( myfiles[1], header = F, skip = 0, nrows = 4, sep="" )[-c(1:3),]
#check that first line was read correctly
df
#read all the other files and update dataframe
#we read 4 lines to read the header correctly, then remove 3
ans<- lapply(myfiles[-1], function(x){ read.csv( x, header = F, skip = 0, nrows = 4, sep="")[-c(1:3),] })
ans
#update dataframe
lapply(ans, function(x){df<<-rbind(df,x)} )
#this should be the required dataframe
df
Also, if you are on Linux, a much simple method would be to simply make the OS do it for you
awk 'FNR == 4' regional_vol*.txt
This should do it for you.
# set the working directory (where files are saved)
setwd("C:/Users/your_path_here/Desktop/")
file_names = list.files(getwd())
file_names = file_names[grepl(".TXT",file_names)]
# print file_names vector
file_names
# read the WY.TXT file, just for testing
# file = read.csv("C:/Users/your_path_here/Desktop/regional_vol_GM_atlas1.txt", header=F, stringsAsFactors=F)
# see the data structure
str(file)
# run read.csv on all values of file_names
files = lapply(file_names, read.csv, header=F, stringsAsFactors = F)
files = do.call(rbind,files)
# set column names
names(files) = c("field1", "field2", "field3", "field4", "field5")
str(files)
write.table(files, "C:/Users/your_path_here/Desktop/mydata.txt", sep="\t")
write.csv(files,"C:/Users/your_path_here/Desktop/mydata.csv")