Remove path from variable name in a dataframe

Remove path from variable name in a dataframe - r

I've put together a function that looks like this, with the first comment lines being an example. Most importantly here is the set.path variable that I use to set the path initially for the function.
# igor.import(set.path = "~/Desktop/Experiment1 Folder/SCNavigator/Traces",
# set.pattern = "StepsCrop.ibw",
# remove.na = TRUE)
igor.multifile.import <- function(set.path, set.pattern, remove.na){
{
require("IgorR")
require("reshape2")
raw_list <- list.files(path= set.path,
pattern= set.pattern,
recursive= TRUE,
full.names=TRUE)
multi.read <- function(f) { # Note that "temp.data" is just a placeholder in the function
temp_data <- as.vector(read.ibw(f)) # Change extension to match your data type
}
my_list <- sapply(X = raw_list, FUN = multi.read) # Takes all files gathered in raw_list and applies multi.read()
my_list_combined <- as.data.frame(do.call(rbind, my_list))
my_list_rotated <- t(my_list_combined[nrow(my_list_combined):1,]) # Matrix form
data_out <- melt(my_list_rotated) # "Long form", readable by ggplot2
data_out$frame <- gsub("V", "", data_out$Var1)
data_out$name <- gsub(set.path, "", data_out$Var2) # FIX THIS
}
if (remove.na == TRUE){
set_name <- na.omit(data_out)
} else if (remove.na == FALSE) {
set_name <- data_out
} else (set_name <- data_out)
}
When I run this function I'll get a large dataframe, where each file that matched the pattern will show up with a name like
/Users/Joh/Desktop/Experiment1 Folder/SCNavigator/Traces/Par994/StepsCrop.ibw`
that includes the entire filepath, and is a bit unwieldy to look at and deal with.
I've tried to remove the path part with the line that says
data_out$name <- gsub(set.path, "", data_out$Var2)
Similar to the command above that removes the dataframe auto-named V1, V2, V3... (which works). I can't remove the string part matching the set.path = "my/path/" though.

Regardless of what your set.path is, you can eliminate it by
gsub(".*/","",mypath)
mypath<-"/Users/Joh/Desktop/Experiment1 Folder/SCNavigator/Traces/Par994/StepsCrop.ibw"
gsub(".*/","",mypath)
[1] "StepsCrop.ibw"
`

Related

List elements getting overwritten in for loop R?

I have a bunch of csv files that I'm trying to read into R all at once, with each data frame from a csv becoming an element of a list. The loops largely work, but they keep overriding the list elements. So, for example, if I loop over the first 2 files, both data frames in list[[1]] and list[[2]] will contain the data frame for the second file.
#function to open one group of files named with "cores"
open_csv_core<- function(year, orgtype){
file<- paste(year, "/coreco.core", year, orgtype, ".csv", sep = "")
df <- read.csv(file)
names(df) <- tolower(names(df))
df <- df[df$ntee1 %in% c("C","D"),]
df<- df[!(df$nteecc %in% c("D20","D40", "D50", "D60", "D61")),]
return(df)
}
#function to open one group of files named with "nccs"
open_csv_nccs<- function(year, orgtype){
file2<- paste(year, "/nccs.core", year, orgtype, ".csv", sep="")
df2 <- read.csv(file2)
names(df2) <- tolower(names(df2))
df2 <- df2[df2$ntee1 %in% c("C","D"),]
df2<- df2[!(df2$nteecc %in% c("D20","D40", "D50", "D60", "D61")),]
return(df2)
}
#############################################################################
yrpc<- list()
yrpf<- list()
yrco<- list()
fname<- vector()
file_yrs<- as.character(c(1989:2019))
for(i in 1:length(file_yrs)){
fname<- list.files(path = file_yrs[i], pattern = NULL)
#accessing files in a folder and assigning to the proper function to open them based on how the file is named
for(j in 1:length(fname)){
if(grepl("pc.csv", fname[j])==T) {
if(grepl("nccs", fname[j])==T){
a <- open_csv_nccs(file_yrs[j], "pc")
yrpc[[paste0(file_yrs[i], "pc")]] <- a
} else {
b<- open_csv_core(file_yrs[j], "pc")
yrpc[[paste0(file_yrs[i], "pc")]] <- b
}
} else if (grepl("pf.csv", fname[j])==T){
if(grepl("nccs", fname[j])==T){
c <- open_csv_nccs(file_yrs[j], "pf")
yrpf[[paste0(file_yrs[i], "pf")]] <- c
} else {
d<- open_csv_core(file_yrs[j], "pf")
yrpf[[paste0(file_yrs[i], "pf")]] <- d
}
} else {
if(grepl("nccs", fname[j])==T){
e<- open_csv_nccs(file_yrs[j], "co")
yrco[[paste0(file_yrs[i], "co")]] <- e
} else {
f<- open_csv_core(file_yrs[j], "co")
yrco[[paste0(file_yrs[i], "co")]] <- f
}
}
}
}

Actually, both of your csv reading functions do exactly the same,
except that the paths are different.
If you find a way to list your files with abstract paths instead of relative
paths (just the file names), you wouldn't need to reconstruct the paths like
you do. This is possible by full.names = TRUE in list.files().
The second point is, it seems there is never from same year and same type
a "nccs.core" file in addition to a "coreco.core" file. So they are mutually
exclusive. So then, there is no logics necessary to distinguish those cases, which simplifies our code.
The third point is, you just want to separate the data frames by filetype ("pc", "pf", "co") and years.
Instead of creating 3 lists for each type, I would create one res-ults list, which contains for each type an inner list.
I would solve this like this:
years <- c(1989:2019)
path_to_type <- function(path) gsub(".*(pc|pf|co)\\.csv", "\\1", path)
res <- list("pc" = list(),
"pf" = list(),
"co" = list())
lapply(years, function(year) {
files <- list.files(path = year, pattern = "\\.csv", full.names = TRUE)
dfs <- lapply(files, function(path) {
print(path) # just to signal that the path is getting processed
df <- read.csv(path)
file_type <- path_to_type(path)
names(df) <- tolower(names(df))
df <- df[df$ntee1 %in% c("C", "D"), ]
df <- df[!(df$nteecc %in% c("D20", "D40", "D50", "D60", "D61")), ]
res[[file_type]][[year]] <- df
})
})
Now you can call from result's list by file_type and year
e.g.:
res[["co"]][[1995]]
res[["pf"]][[2018]]
And so on.
Actually, the results of the lapply() calls in this case are
not interesting. Just the content of res ... (result list).

It seems that in your for(j in 1:length(fname)){... you are creating one of 4 variable a, b, c or d. And you're reusing these variable names, so they are getting overwritten.
The "correct" way to do this is to use lapply in place of the for loop. Pass the list of files, and the required function (i.e. open_csv_core, etc) to lapply, and the return value that you get back is a list of the results.

Breaking the golden rules on FOR loops with R

Firstly, apologies as this may seem a bit long winded, but I hope to give as much information on this problem as I can...
I have written a script that loops through a set of files defined in a csv file. Each file within this csv listing is an XML file, each one is for a particular event in an application, and all files within this list are of the same event type. However, each file can contain different data. For instance, one could hold an attribute with no child nodes beneath, while others contain nodes.
My script works perfectly fine, but when it gets to about XML file 5000, it has slowed down considerably.
Problem is that my code creates a blank dataframe initially, and then grows is at new columns are detected.
I understand that this is a big NO NO when it comes to writing R FOR loops, but am unsure how to get around this problem, give my smallest file listing is 69000, which makes going through each one in turn and counting the nodes a task in itself.
Are there any ideas on how to get around this?
pseudo code or actual R code to do this would be great. So would ideas/opinions, as I am unsure on the best approach to this task.
Here is my current code.
library(XML)
library(xml2)
library(plyr)
library(tidyverse)
library(reshape2)
library(foreign)
library(rio)
# Get file data to be used
#
setwd('c:/temp/xml')
headerNames <- c('GUID','EventId','AppId','RequestFile', 'AE_Type', 'AE_Drive')
GetNames <- rowid_to_column(read.csv(file= 'c:/temp/xml/R_EventIdA.csv', fileEncoding="UTF-8-BOM", header = FALSE, col.names = headerNames),'ID')
inputfiles <- as.character(GetNames[,5]) # Gets list of files
# Create empty dataframes
#
df <- data.frame()
transposed.df1 <- data.frame()
allxmldata <- data.frame()
findchildren<-function(nodes, df) {
numchild <- sapply(nodes, function(x){length(xml_children(x))})
xml.value <- xml_text(nodes[numchild==0])
xml.name <- xml_name(nodes[numchild==0])
xml.path <- sapply(nodes[numchild==0], function(x) {gsub(', ','_', toString(rev(xml_name(xml_parents(x)))))})
fieldname <- paste(xml.path,xml.name,sep = '_')
contents <- sapply(xml.value, function(f){is.na(f)<-which(f == '');f})
if (length(fieldname) > 0) {
fieldname <- paste(fieldname,xml.value, sep = '_')
dftemp <- data.frame(fieldname, contents)
df <- rbind(df, dftemp)
print(dim(df))
}
if (sum(numchild)>0){
findchildren(xml_children(nodes[numchild>0]), df) }
else{ return(df)
}
}
findchildren2<-function(nodes, df){
numchild<-sapply(nodes, function(embeddedinputfile){length(xml_children(embeddedinputfile))})
xmlvalue<-xml_text(nodes[numchild==0])
xmlname<-xml_name(nodes[numchild==0])
xmlpath<-sapply(nodes[numchild==0], function(embeddedinputfile) {gsub(', ','_', toString(rev(xml_name(xml_parents(embeddedinputfile)))))})
fieldname<-paste(xmlpath,xmlname,sep = '_')
contents<-sapply(xmlvalue, function(f){is.na(f)<-which(f == '');f})
if (length(fieldname) > 0) {
dftemp<-data.frame(fieldname, contents)
df<-rbind(df, dftemp)
print(dim(df))
}
if (sum(numchild)>0){
findchildren2(xml_children(nodes[numchild>0]), df) }
else{ return(df)
}
}
# Loop all files
#
for (x in inputfiles) {
df1 <- findchildren(xml_children(read_xml(x)),df)
## original xml dataframe
if (length(df1) > 0) {
xml.df1 <- data.frame(spread(df1, key = fieldname, value = contents), fix.empty.names = TRUE)
}
##
xml.df1 %>%
pluck('Response_RawData') -> rawxml
if (length(rawxml)>0) {
df.rawxml <- data.frame(rawxml)
export(df.rawxml,'embedded.xml')
embeddedinputfile <-as.character('embedded.xml')
rm(df1)
df1 <- findchildren2(xml_children(read_xml(embeddedinputfile)),df)
if (length(df1) > 0) {
xml.df2 <- spread(df1, key = fieldname, value = contents)
}
allxmldata <- rbind.fill(allxmldata,cbind(xml.df1,xml.df2))
} else {
allxmldata <- rbind.fill(allxmldata,cbind(xml.df1))
}
}
if(nrow(allxmldata)==nrow(GetNames)) {
alleventdata<-cbind(GetNames,allxmldata)
}
dbConn2 <- odbcDriverConnect('driver={SQL Server};server=PC-XYZ;database=Events;trusted_connection=true')
sqlSave(dbConn2, alleventdata, tablename = 'AE_EventA', append = TRUE )

Applying a function on all csv files from a certain folder

I am reading csv files from a certain folder, which all have the same structure. Furthermore, I have created a function which adds a certain value to a dataFrame.
I have created the "folder reading" - part and also created the function. However, I now need to connect these two with each other. This is where I am having my problems:
Here is my code:
addValue <- function(valueToAdd, df.file, writterPath) {
df.file$result <- df.file$Value + valueToAdd
x <- x + 1
df.file <- as.data.frame(do.call(cbind, df.file))
fullFilePath <- paste(writterPath, x , "myFile.csv", sep="")
write.csv(as.data.frame(df.file), fullFilePath)
}
#1.reading R files
path <- "C:/Users/RFiles/files/"
files <- list.files(path=path, pattern="*.csv")
for(file in files)
{
perpos <- which(strsplit(file, "")[[1]]==".")
assign(
gsub(" ","",substr(file, 1, perpos-1)),
read.csv(paste(path,file,sep="")))
}
#2.appyling function
writterPath <- "C:/Users/RFiles/files/results/"
addValue(2, sys, writterPath)
How to apply the addValue() function in my #1.reading R files construct? Any recommendations?
I appreciate your answers!
UPDATE
When trying out the example code, I get:
+ }
+ ## If you really need to change filenames with numbers,
+ newfname <- file.path(npath, paste0(x, basename(fname)))
+ ## otherwise just use `file.path(npath, basename(fname))`.
+
+ ## (4) Write back to a different file location:
+ write.csv(newdat, file = newfname, row.names = FALSE)
+ }
Error in `$<-.data.frame`(`*tmp*`, "results", value = numeric(0)) :
replacement has 0 rows, data has 11
Any suggestions?

There are several problems with your code (e.g., x in your function is never defined and is not retained between calls to addValue), so I'm guessing that this is a chopped-down version of the real code and you still have remnants remaining. Instead of picking it apart verbosely, I'll just offer my own suggested code and a few pointers.
The function addValue looks like it is good for changing a data.frame, but I would not have guessed (by the name, at least) that it would also write the file to disk (and potentially over-write an existing file).
I'm guessing you are trying to (1) read a file, (2) "add value" to it, (3) assign it to a global variable, and (4) write it to disk. The third can be problematic (and contentious with some programmers), but I'll leave it for now.
Unless writing to disk is inherent to your idea of "adding value" to a data.frame, I recommend you keep #2 separate from #4. Below is a suggested alternative to your code:
addValue <- function(valueToAdd, df) {
df$results <- df$Value + valueToAdd
## more stuff here?
return(df)
}
opath <- 'c:/Users/RFiles/files/raw' # notice the difference
npath <- 'c:/Users/RFiles/files/adjusted'
files <- list.files(path = opath, pattern = '*.csv', full.names = TRUE)
x <- 0
for (fname in files) {
x <- x + 1
## (1) read in and (2) "add value" to it
dat <- read.csv(fname)
newdat <- addValue(2, dat)
## (3) Conditionally assign to a global variable:
varname <- gsub('\\.[^.]*$', '', basename(fname))
if (! exists(varname)) {
assign(x = varname, value = newdat)
} else {
warning('variable exists, did not overwrite: ', varname)
}
## If you really need to change filenames with numbers,
newfname <- file.path(npath, paste0(x, basename(fname)))
## otherwise just use `file.path(npath, basename(fname))`.
## (4) Write back to a different file location:
write.csv(newdat, file = newfname, row.names = FALSE)
}
Notice that it will not overwrite global variables. This may be an annoying check, but will keep you from losing data if you accidentally run this section of code.
An alternative to assigning numerous variables to the global address space is to save all of them to a single list. Assuming they are the same format, you will likely be dealing with them with identical (or very similar) analytical methods, so putting them all in one list will facilitate that. The alternative of tracking disparate variable names can be tiresome.
## addValue as defined previously
opath <- 'c:/Users/RFiles/files/raw'
npath <- 'c:/Users/RFiles/files/adjusted'
ofiles <- list.files(path = opath, pattern = '*.csv', full.names = TRUE)
nfiles <- file.path(npath, basename(ofiles))
dats <- mapply(function(ofname, nfname) {
dat <- read.csv(ofname)
newdat <- addValue(2, dat)
write.csv(newdat, file = nfname, row.names = FALSE)
newdat
}, ofiles, nfiles, SIMPLIFY = FALSE)
length(dats) # number of files
names(dats) # one for each file

R 3.1 sapply to a list of files

I want to parse the read.table() function to a list of .txt files. These files are in my current directory.
my.txt.list <-
list("subject_test.txt", "subject_train.txt", "X_test.txt", "X_train.txt")
Before applying read.table() to elements of this list, I want to check if the dt has not been already computed and is in a cache directory. dt from cache directory are already in my environment(), in form of file_name.dt
R> ls()
"subject_test.dt" "subject_train.dt"
In this example, I only want to compute "X_test.txt" and "X_train.txt". I wrote a small function to test if dt has already been cached and apply read.table()in case not.
my.rt <- function(x,...){
# apply read.table to txt files if data table is not already cached
# x is a character vector
y <- strsplit(x,'.txt')
y <- paste(y,'.dt',sep = '')
if (y %in% ls() == FALSE){
rt <- read.table(x, header = F, sep = "", dec = '.')
}
}
This function works if I take one element this way :
subject_test.dt <- my.rt('subject_test.txt')
Now I want to sapply to my files list this way:
my.res <- saply(my.txt.list,my.rt)
I have my.resas a list of df, but the issue is the function compute all files and does take into account already computed files.
I must be missing something, but I can't see why.
TY for suggestions.

I think it has to do with the use of strsplit in your example. strsplit returns a list.
What about this?
my.txt.files <- c("subject_test.txt", "subject_train.txt", "X_test.txt", "X_train.txt")
> ls()
[1] "subject_test.dt" "subject_train.dt"
my.rt <- function(x){
y <- gsub(".txt", ".dt", x, fixed = T)
if (!(y %in% ls())) {
read.table(x, header = F, sep = "", dec = '.') }
}
my.res <- sapply(my.txt.files, FUN = my.rt)
Note that I'm replacing .txt with .dt and I'm doing a "not in". You will get NULL entries in the result list if a file is not processed.
This is untested, but I think it should work...

Output formatting in R

I am new to R and trying to do some correlation analysis on multiple sets of data. I am able to do the analysis, but I am trying to figure out how I can output the results of my data. I'd like to have output like the following:
NAME,COR1,COR2
....,....,....
....,....,....
If I could write such a file to output, then I can post process it as needed. My processing script looks like this:
run_analysis <- function(logfile, name)
{
preds <- read.table(logfile, header=T, sep=",")
# do something with the data: create some_col, another_col, etc.
result1 <- cor(some_col, another_col)
result1 <- cor(some_col2, another_col2)
# somehow output name,result1,result2 to a CSV file
}
args <- commandArgs(trailingOnly = TRUE)
date <- args[1]
basepath <- args[2]
logbase <- paste(basepath, date, sep="/")
logfile_pattern <- paste( "*", date, "csv", sep=".")
logfiles <- list.files(path=logbase, pattern=logfile_pattern)
for (f in logfiles) {
name = unlist(strsplit(f,"\\."))[1]
logfile = paste(logbase, f, sep="/")
run_analysis(logfile, name)
}
Is there an easy way to create a blank data frame and then add data to it, row by row?

Have you looked at the functions in R for writing data to files? For instance, write.csv. Perhaps something like this:
rs <- data.frame(name = name, COR1 = result1, COR2 = result2)
write.csv(rs,"path/to/file",append = TRUE,...)

I like using the foreach library for this sort of thing:
library(foreach)
run_analysis <- function(logfile, name) {
preds <- read.table(logfile, header=T, sep=",")
# do something with the data: create some_col, another_col, etc.
result1 <- cor(some_col, another_col)
result2 <- cor(some_col2, another_col2)
# Return one row of results.
data.frame(name=name, cor1=result1, cor2=result2)
}
args <- commandArgs(trailingOnly = TRUE)
date <- args[1]
basepath <- args[2]
logbase <- paste(basepath, date, sep="/")
logfile_pattern <- paste( "*", date, "csv", sep=".")
logfiles <- list.files(path=logbase, pattern=logfile_pattern)
## Collect results from run_analysis into a table, by rows.
dat <- foreach (f=logfiles, .combine="rbind") %do% {
name = unlist(strsplit(f,"\\."))[1]
logfile = paste(logbase, f, sep="/")
run_analysis(logfile, name)
}
## Write output.
write.csv(dat, "output.dat", quote=FALSE)
What this does is to generate one row of output on each call to run_analysis, binding them into a single table called dat (the .combine="rbind" part of the call to foreach causes row binding). Then you can just use write.csv to get the output you want.

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Remove path from variable name in a dataframe - r

Regardless of what your set.path is, you can eliminate it by gsub("./","",mypath) mypath<-"/Users/Joh/Desktop/Experiment1 Folder/SCNavigator/Traces/Par994/StepsCrop.ibw" gsub("./","",mypath) [1] "StepsCrop.ibw" `

Related

List elements getting overwritten in for loop R?

Breaking the golden rules on FOR loops with R

Applying a function on all csv files from a certain folder

R 3.1 sapply to a list of files

Output formatting in R

Categories

Resources

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Remove path from variable name in a dataframe - r

Regardless of what your set.path is, you can eliminate it by gsub(".*/","",mypath) mypath<-"/Users/Joh/Desktop/Experiment1 Folder/SCNavigator/Traces/Par994/StepsCrop.ibw" gsub(".*/","",mypath) [1] "StepsCrop.ibw" `

Related

List elements getting overwritten in for loop R?

Breaking the golden rules on FOR loops with R

Applying a function on all csv files from a certain folder

R 3.1 sapply to a list of files

Output formatting in R

Categories

Resources

Regardless of what your set.path is, you can eliminate it by gsub("./","",mypath) mypath<-"/Users/Joh/Desktop/Experiment1 Folder/SCNavigator/Traces/Par994/StepsCrop.ibw" gsub("./","",mypath) [1] "StepsCrop.ibw" `