Extract rows from csv files in R

I want to extract the latitude/longitude data plus the file name from each CSV file.
I have done the following:
#libraries-----
library(readr)
library("dplyr")
library("tidyverse")
# set wd-----EXAMPLE
setwd("F:/mydata/myfiles/allcsv")
# have R read files as list -----
list <- list.files("F:/mydata/myfiles/allcsv", pattern=NULL, all.files=FALSE,
full.names=FALSE)
list
#lapply function
row.names<- c("Date=0", "Time=3", "Type=2", "Model=1", "Coordinates=nextrow", "Latitude = 38.3356", "Longitude = 51.3323")
AllData <- lapply(list, read.table,
                  skip=5, header=FALSE, sep=";", row.names=row.names, col.names=NULL)
PulledRows <- lapply(AllData, function(DF)
  DF[fileone$Latitude==38.3356, fileone$Longitude==51.3323]
)
# maybe i need to specify a for loop?
[screenshot of how my data looks omitted]
Thank you.

This should work for you. You may have to change the path location if the .csv files are not in your working directory, as well as the location where the final results are saved.
# uses readr::read_csv() and stringr::str_remove(), both attached via library(tidyverse) above
results <- data.frame(Latitude=NA, Longitude=NA, FileName=NA) # create empty dataframe
for(i in 1:length(list)){ # loop through each file obtained from list (called above)
  dat <- read_csv(list[i], col_names = FALSE) # read in the ith dataset
  df <- data.frame(dat[6,1], dat[7,1], list[i]) # create new dataframe with values from dat
  df[,1] <- as.numeric(str_remove(df[,1], 'Latitude=')) # remove text and make numeric
  df[,2] <- as.numeric(str_remove(df[,2], 'Longitude='))
  names(df) <- names(results) # having the same column names allows next line
  results <- rbind(results, df) # 'stacks' the results dataframe and df dataframe
}
results <- na.omit(results) # remove missing values (first row)
write_csv(results, 'desired/path') # save to your desired path
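If you prefer to avoid growing results inside a loop (the same advice given in the answers further down), here is a minimal sketch of the same idea with lapply, assuming your file vector list and the same Latitude=/Longitude= layout on rows 6 and 7 of the first column:
library(tidyverse) # for read_csv() and str_remove()
results_list <- lapply(list, function(f) {
  dat <- read_csv(f, col_names = FALSE) # read one file
  data.frame(
    Latitude  = as.numeric(str_remove(dat[[1]][6], "Latitude=")),
    Longitude = as.numeric(str_remove(dat[[1]][7], "Longitude=")),
    FileName  = f
  )
})
results <- do.call(rbind, results_list) # bind all per-file rows once at the end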

Related

Trouble using mutate within a for loop

I'm trying to write a function called complete that takes a file directory (which has csv files titled 1-332) and the title of a file as a number, and prints out the number of rows without NA in the sulfate or nitrate columns. I am trying to use mutate to add a column titled nobs which is 1 if neither column is NA, and then take the sum of nobs as my answer, but I get an error message that the object nob is not found. How can I fix this? The specific file directory in question is downloaded within this block of code.
library(tidyverse)
if(!file.exists("rprog-data-specdata.zip")) {
  temp <- tempfile()
  download.file("https://d396qusza40orc.cloudfront.net/rprog%2Fdata%2Fspecdata.zip", temp)
  unzip(temp)
  unlink(temp)
}
complete <- function(directory, id = 1:332){
  #create a list of files
  files_full <- list.files(directory, full.names = TRUE)
  #create an empty data frame
  dat <- data.frame()
  for(i in id){
    dat <- rbind(dat, read.csv(files_full[i]))
  }
  mutate(dat, nob = ifelse(!is.na(dat$sulfate) & !is.na(dat$nitrate), 1, 0))
  x <- summarise(dat, sum = sum(nob))
  return(x)
}
When one runs the following code, nobs should be 117, but I get an error message instead:
complete("specdata", 1)
Error: object 'nob' not found
I think the function below should get what you need. Rather than a loop, I prefer map (or apply) in this setting. It's difficult to say where your code went wrong without the error message or an example I can run on my machine, however.
Happy Coding,
Daniel
library(tidyverse)
complete <- function(directory, id = 1:332){
  # create a list of files
  files_full <- list.files(directory, full.names = TRUE)
  # cycle over each requested file to get the number of nonmissing rows
  purrr::map_int(
    files_full[id],                       # subset to the requested file ids
    ~ read.csv(.x) %>%                    # read in datafile
      dplyr::select(sulfate, nitrate) %>% # select two columns of interest
      tidyr::drop_na() %>%                # drop missing observations
      nrow()                              # get the number of rows with no missing data
  ) %>%
    sum() # sum the total number of rows not missing among all files
}
As mentioned, avoid building objects in a loop. Instead, consider building a list of data frames from each csv then call rbind once. In fact, even consider base R (i.e., tinyverse) for all your needs:
complete <- function(directory, id = 1:332){
  # create a list of files
  files_full <- list.files(directory, full.names = TRUE)
  # create a list of data frames
  df_list <- lapply(files_full[id], read.csv)
  # build a single data frame with nob column
  dat <- transform(do.call(rbind, df_list),
                   nob = ifelse(!is.na(sulfate) & !is.na(nitrate), 1, 0)
  )
  return(sum(dat$nob))
}
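For reference, calling either version as in the question (assuming the specdata folder from the zip above has been unzipped into the working directory) should reproduce the expected count:
complete("specdata", 1)
# [1] 117  (the value the question says it should be)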

Dataframes are created but column names are not changing when reading from excel workbook

I am trying to read an Excel workbook in R and create a dataframe for each sheet.
In the next step, I want to take each created dataframe and prefix every column name in it with the sheet name followed by an underscore.
Here is what I am doing:
library(readxl)
# Store Sheet Names in a vector
sheet <- excel_sheets("D:/OTC/JULY DATA.XLSX")
# Trim any of the Trailing White Spaces
sheet_trim_trailing <- function (x) sub("\\s+$", "", x)
sheet <- sheet_trim_trailing(sheet)
# Read each of the sheets in the workbook and create a
# dataframe using respective names of the sheets
for(i in 1:length(sheet)){
  # this reads a sheet and creates the dataframe using its name
  assign(sheet[i], read.xlsx("DATA.XLSX", sheetIndex = i))
  # store dataframe name into a vector
  sname <- sheet[i]
  # use vector to change the col names in the respective dataframe
  colnames(sname) <- gsub("^", paste0(sname,"_"), colnames(sname))
}
The dataframes are created, but the column names are not changing. Where am I going wrong?
What you need to do is something like
colnames(get(sheet[i])) <- gsub("^", paste0(sname,"_"), colnames(get(sheet[i])))
But this will give an error
target of assignment expands to non-language object
A workaround is to use a temporary variable to change column names
Reproducible example
temp <- mtcars[1:5,]
d <- get("temp")
colnames(d) <- sub("y", " ", colnames(d))
assign("temp", d)
Try this
for(i in 1:length(sheet)){
  assign(sheet[i], read.xlsx("DATA.XLSX", sheetIndex = i))
  t <- get(sheet[i])
  colnames(t) <- gsub("^", paste0(sheet[i],"_"), colnames(t))
  assign(sheet[i], t)
}
I think I was looking for something like this one, which does the same as above.
Try This Alternative:
library(readxl)
# function to read all the sheets from excel workbook
read_all_sheets <- function(xlsfile) {
  sheets <- excel_sheets(xlsfile)
  setNames(lapply(sheets, function(.) {
    tbl <- read_excel(xlsfile, sheet = .)
    # this will change the col names with sheet name
    # and underscore as prefix
    names(tbl) <- paste(., names(tbl), sep = "_")
    tbl
  }), sheets)
}
## create dataframes from sheets
# first read all the sheets as a list
List_of_All_Sheets <- read_all_sheets("Location/of/the/file.xlsx")
# then create dataframes
lapply(names(List_of_All_Sheets),
       function(nams) assign(nams, List_of_All_Sheets[[nams]],
                             envir = .GlobalEnv))
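As a small aside, the final lapply/assign step can also be written with base R's list2env, which assigns every element of a named list into an environment in one call:
# equivalent to the lapply/assign call above
list2env(List_of_All_Sheets, envir = .GlobalEnv)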

How do I output numbered dataframes from each iteration of a loop in R

I am trying to process some files which have gene annotation columns in them. I want to output a dataframe from each iteration of the loop I have, so that eventually I can work with each dataframe separately (ideally each dataframe should be named after the file it comes from). I have attached my working function below, but I don't know how to output named dataframes from each iteration.
library(gridExtra)
#LP6007427.DNA_PolyATClusters<-read.delim("U:\\Batch1\\LP6007427-DNA_PolyATClusters.csv",sep=",")
setwd("U:/Batch1/")
path = "U:\\Batch1\\"
path2 = "U:\\bed_extract\\"
filename <- dir(path, pattern =".csv")
pltList <- list()
for(i in 1:length(filename)){
  file <- read.delim(filename[i],header=FALSE,sep=',')
  row.names(file)<-NULL
  file<-file[-c(1:25),]
  write.table(i)
  Pusung<-data.frame(file[1],file[2],file[3],file[4],file[8])
  names(Pusung)<-c("chr","Start","End","Type","Gene")
  library(stringr)
  Pusung$chr<-gsub("chr","",Pusung$chr)
  Pusung$chr <- factor(as.integer(Pusung$chr), levels=unique(Pusung$chr))
  Pusung$Type<-gsub("J","Barr",Pusung$Type)
  Pusung$Type<-gsub("T","Tumour",Pusung$Type)
  PusungTumour <- as.data.frame(Pusung[grep("Barr", Pusung$Type, invert=TRUE), ])
  PusungShared<-as.data.frame(subset(Pusung,grepl('Barr', Type) & grepl('Tumour', Type)))
  PusungBarr <- as.data.frame(Pusung[grep("Tumour", Pusung$Type, invert=TRUE), ])
  PusungTumour$Type<-gsub('.*',"Tumour",PusungTumour$Type)
  PusungTumour[PusungTumour==""]<-NA
  Intergenic_Tum<-subset(PusungTumour,!is.na(PusungTumour$Gene))
  Intragenic_Tum<-subset(PusungTumour,is.na(PusungTumour$Gene))
  PusungBarr$Type<-gsub('.*',"Barr",PusungBarr$Type)
  PusungBarr[PusungBarr==""]<-NA
  Intergenic_Barr<-subset(PusungBarr,!is.na(PusungBarr$Gene))
  Intragenic_Barr<-subset(PusungBarr,is.na(PusungBarr$Gene))
  PusungShared$Type<-gsub('.*',"Shared",PusungShared$Type)
  PusungShared[PusungShared==""]<-NA
  Intergenic_Shared<-subset(PusungShared,!is.na(PusungShared$Gene))
  Intragenic_Shared<-subset(PusungShared,is.na(PusungShared$Gene))
  PusungBound<-rbind(Intragenic_Barr,Intragenic_Tum)
  PusungBound<-rbind(PusungBound,Intragenic_Shared)
}
do.call(grid.arrange, pltList)
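This question has no answer in the thread; as a hedged sketch of one common pattern (not from the original thread), collect each iteration's result in a list named after the source file instead of creating numbered objects:
# sketch: wrap the per-file processing in a function, then build a named list
process_file <- function(f) {
  file <- read.delim(f, header = FALSE, sep = ',')
  # ... the same cleaning steps as in the loop above, ending with PusungBound ...
  file # placeholder return; substitute PusungBound from the real processing
}
results <- lapply(filename, process_file)
names(results) <- tools::file_path_sans_ext(filename)
# each processed data frame is then available by its file name, e.g.
# results[["LP6007427-DNA_PolyATClusters"]]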

applying same function on multiple files in R

I am new to R program and currently working on a set of financial data. Now I got around 10 csv files under my working directory and I want to analyze one of them and apply the same command to the rest of csv files.
Here are all the names of these files: ("US%10y.csv", "UK%10y.csv", "GER%10y.csv","JAP%10y.csv", "CHI%10y.csv", "SWI%10y.csv","SOA%10y.csv", "BRA%10y.csv", "CAN%10y.csv", "AUS%10y.csv")
For example, because the Date column in the CSV files is read in as a factor, I need to change it to Date format:
CAN <- read.csv("CAN%10y.csv", header = T, sep = ",")
CAN$Date <- as.character(CAN$Date)
CAN$Date <- as.Date(CAN$Date, format ="%m/%d/%y")
CAN_merge <- merge(all.dates.frame, CAN, all = T)
CAN_merge$Bid.Yield.To.Maturity <- NULL
all.dates.frame is a data frame of 731 consecutive days. I want to merge them so that each file will have the same number of rows which later enables me to combine 10 files together to get a 731 X 11 master data frame.
Surely I can copy and paste this code and change the file name each time, but is there any simpler approach using apply or a for loop to do that?
Thank you very much for your help.
This should do the trick. Leave a comment if a certain part doesn't work; I wrote this blind without testing.
Get a list of files in your current directory ending in name .csv
L = list.files(".", ".csv")
Loop through each of the names, read in each file, perform the actions you want to perform, return the data.frame DF_Merge, and store the results in a list.
O = lapply(L, function(x) {
  DF <- read.csv(x, header = T, sep = ",")
  DF$Date <- as.character(DF$Date)
  DF$Date <- as.Date(DF$Date, format = "%m/%d/%y")
  DF_Merge <- merge(all.dates.frame, DF, all = T)
  DF_Merge$Bid.Yield.To.Maturity <- NULL
  return(DF_Merge)
})
Bind all the DF_Merge data.frames into one big data.frame
do.call(rbind, O)
I'm guessing you need some kind of indicator, so this may be useful. Create an indicator column based on the first 3 characters of each file name: rep(substring(L, 1, 3), each = 731)
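For instance, a minimal sketch of attaching that indicator after the rbind step (assuming every merged file really does contribute 731 rows; the column name country is hypothetical):
combined <- do.call(rbind, O)
combined$country <- rep(substring(L, 1, 3), each = 731) # first 3 characters of each file name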
A dplyr solution (though untested since no reproducible example given):
library(dplyr)
file_list <- c("US%10y.csv", "UK%10y.csv", "GER%10y.csv","JAP%10y.csv", "CHI%10y.csv", "SWI%10y.csv","SOA%10y.csv", "BRA%10y.csv", "CAN%10y.csv", "AUS%10y.csv")
can_l <- lapply(
file_list
, read.csv
)
can_l <- lapply(
can_l
, function(df) {
df %>% mutate(Date = as.Date(as.character(Date), format ="%m/%d/%y"))
}
)
# Rows do need to match when column-binding
can_merge <- left_join(
all.dates.frame
, bind_cols(can_l)
)
can_merge <- can_merge %>%
select(-Bid.Yield.To.Maturity)
One possible solution would be to read all the files into R in the form of a list, and then use lapply to to apply a function to all data files. For example:
# Create vector of file names in the working directory
files <- list.files()
files <- files[grep("csv", files)]
#create empty list
lst <- vector("list", length(files))
#Read files in to list
for(i in 1:length(files)) {
  lst[[i]] <- read.csv(files[i])
}
#Apply a function to the list
l <- lapply(lst, function(x) {
  x$Date <- as.Date(as.character(x$Date), format = "%m/%d/%y")
  return(x)
})
Hope it's helpful.

Loop through data frames and select one column in each of the data frame inside the loop using R

I would like to loop through several data frames and select an individual column from each data frame. I use the following code for this, but it gives me an error. Could someone please guide me on what should be corrected in this code?
for (i in 1:3) {
  cur_file <- paste(i, ".csv", sep="")
  curfile <- list.files(pattern = cur_file)
  rd_data[i] <- read.csv(curfile, header=F, sep="\t")
  col1 <- rd_data[i,1] # select the first column in the "1st" data frame
  n_val[i] <- rd_data[i,2] # select the second column in each of the "ith" data frames
}
You can do this without the for loop entirely:
files <- list.files(pattern='*.csv')
dat <- lapply(files, read.csv, header=FALSE, sep='\t') # apply read.csv to each element of files
col_1_list <- lapply(dat, '[', 1) # use the [ function, see ?"[" for more info.
n_val_list <- lapply(dat, '[', 2)
Also, your code:
col1 <- rd_data[i,1] # select the first column in the "1st" data frame
will select the first column of each data frame, not just the first one.
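One small usage note: each element of col_1_list above is itself a one-column data frame; if plain vectors are wanted instead, the [[ extractor can be used (a hedged sketch using the objects defined above):
col_1_vectors <- lapply(dat, '[[', 1) # first column of each file as a plain vector
col_1_list[[1]]                       # first column of the first file, still a data frame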
