I'm reading a text file containing data with a "problematic" line. The last line, which starts with *NOTE, has to be removed (the number of rows in the text file is not always the same):
ColumnA ColumnB ColumnC
A2 17 14
B2 20 -1
C2 21 36
*NOTE: -1 = data do not exist
This is my line to read the text file (I have to select the text file interactively since its location is not constant):
my_data <- read.delim(file.choose(), header = TRUE, sep = "", quote = "",
dec = ".", fill = TRUE, comment.char = "")
I have tried :
my_data[- grep("*NOTE:", my_data$ColumnA),]
But it does not seem to work.
Any simple solutions to this?
You could call read.delim with comment.char = "*":
my_data <- read.delim(file.choose(), header = TRUE, sep = "", quote = "",
dec = ".", fill = TRUE, comment.char = "*")
This will remove the final line when you are reading it in because it starts with *.
Another option is fread from data.table. fread has a fancy autostart feature which automagically drops lines without the expected number of columns:
library(data.table)
fread(file.choose())
There is another way to handle this, which is to write a short function that takes the regexes that you want to filter out. You can feed it the file name, but if this is missing it will give you the file dialogue:
#' Read a whitespace-delimited text file after discarding unwanted lines
#'
#' @param file_path Path to the file. If missing, a file is picked
#'   interactively via file.choose().
#' @param filter_out Character vector of regular expressions. Every line
#'   matching at least one of them is dropped before parsing.
#' @return The data.frame produced by read.delim() on the remaining lines.
read_broken <- function(file_path, filter_out = "^[*]NOTE:") {
  if (missing(file_path)) file_path <- file.choose()
  # Warnings from readLines() (e.g. an incomplete final line) are
  # deliberately silenced.
  x <- suppressWarnings(readLines(file_path))
  x <- x[nzchar(x)]
  # OR the per-pattern matches together. Unlike apply(sapply(...), 1, any),
  # this also works when only a single line is present (sapply() would then
  # simplify to a vector without dim) and when filter_out is empty.
  drop <- Reduce(`|`, lapply(filter_out, grepl, x), logical(length(x)))
  x <- x[!drop]
  read.delim(
    text = x, header = TRUE, sep = "", quote = "", dec = ".", fill = TRUE
  )
}
So you can do:
read_broken("myfile.txt")
#> ColumnA ColumnB ColumnC
#> 1 A2 17 14
#> 2 B2 20 -1
#> 3 C2 21 36
Or
read_broken("myfile.txt", filter_out = c("^[*]NOTE:", "A2"))
#> ColumnA ColumnB ColumnC
#> 1 B2 20 -1
#> 2 C2 21 36
Related
I'm still learning my way around regex, help much appreciated. I'm trying to extract the string from the beginning of the file name, as well as the last two characters from inside the square brackets of "File" below, to generate the "Image" and "ID" variables by mutate, as shown in data.out.
data<- data.frame("File"= c("TA1317_Scan3_Core[1,2,A]_[7473,42737]_component_data",
"TA 2654_Scan1_Core[1,3,A]_[6700,36673]_component_data"))
data.out<- data %>% data.frame("Image"= c("TA1317", "TA2654"), "ID" = c("2A", "3A"))
File Image ID
1 TA1317_Scan3_Core[1,2,A]_[7473,42737]_component_data TA1317 2A
2 TA 2654_Scan1_Core[1,3,A]_[6700,36673]_component_data TA2654 3A
Another alternative is strcapture, with only one regex pattern instead of two:
# Group 1 (Image): everything before the first underscore.
# Group 2 (ID): the last two comma-separated fields inside a bracket group.
# The final field is matched with [^,]* so it may be longer than one
# character; the previous class [^,*] accepted exactly one character.
out <- strcapture(
  "^([^_]*).*?\\[[^,]*,([^,]*,[^,]*)\\].*",
  data$File,
  list(Image = "", ID = "")
)
# Drop the comma that separates the two captured fields.
out$ID <- gsub(",", "", out$ID, fixed = TRUE)
out
# Image ID
# 1 TA1317 2A
# 2 TA 2654 3A
cbind(data, out)
# File Image ID
# 1 TA1317_Scan3_Core[1,2,A]_[7473,42737]_component_data TA1317 2A
# 2 TA 2654_Scan1_Core[1,3,A]_[6700,36673]_component_data TA 2654 3A
Within a dplyr pipe, you can still use it:
library(dplyr)
# Capture the part before the first underscore as Image and the last two
# comma-separated fields inside a bracket group as ID, bind both columns to
# the input, then strip the comma from ID.
# The final field uses [^,]* so it may be longer than one character; the
# previous class [^,*] accepted exactly one character.
data %>%
  bind_cols(
    strcapture(
      "^([^_]*).*?\\[[^,]*,([^,]*,[^,]*)\\].*",
      .$File,
      list(Image = "", ID = "")
    )
  ) %>%
  mutate(ID = gsub(",", "", ID, fixed = TRUE))
# File Image ID
# 1 TA1317_Scan3_Core[1,2,A]_[7473,42737]_component_data TA1317 2A
# 2 TA 2654_Scan1_Core[1,3,A]_[6700,36673]_component_data TA 2654 3A
You can try :
transform(data, Image = sub('([A-Z0-9\\s]+)_.*', '\\1', File),
ID = sub('.*\\[.*(\\d+),([A-Z])\\].*', '\\1\\2', File))
# File Image ID
#1 TA1317_Scan3_Core[1,2,A]_[7473,42737]_component_data TA1317 2A
#2 TA 2654_Scan1_Core[1,3,A]_[6700,36673]_component_data TA 2654 3A
where Image captures one or more occurrence of A-Z, 0-9 or whitespace.
and ID consists of a number followed by a comma and a letter between square brackets.
I have CSV data with a backtick (`) as the string encloser and the yen symbol (¥) as an escape character.
Example :
I tried reading the raw file and replacing the yen symbol with a backslash, but it is not working.
fl <- readLines("data.csv", encoding = "UTF-8")
fl2 <- gsub('¥', "\\", fl)
writeLines(fl2, "Edited_data.txt")
sms_data <- fread("Edited_data.txt", sep = ",", stringsAsFactors = FALSE, quote = "\`", dec = ".", encoding = "UTF-8")
Expected Dataframe
I couldn't access your data since it's an image but here's a version with readr:
library(readr)
dt <- "Sentence, Value1, Value2\n`This is the first row`, 0, 0\n`This , this is something else with a comma¥`, 0, 0"
# We can read for your data, respect your strings within `` and read the the `¥` symbol.
dt_read <- read_csv(dt, quote = "`")
dt_read
#> # A tibble: 2 x 3
#> Sentence Value1 Value2
#> <chr> <dbl> <dbl>
#> 1 This is the first row 0 0
#> 2 This , this is something else with a comma¥ 0 0
# Then, we just replace that symbol with nothing
dt_read$Sentence <- gsub("¥", "", dt_read$Sentence)
dt_read
#> # A tibble: 2 x 3
#> Sentence Value1 Value2
#> <chr> <dbl> <dbl>
#> 1 This is the first row 0 0
#> 2 This , this is something else with a comma 0 0
You can change the escape sequence to whatever you like and change it back once you read the text in. I have reproduced your data here:
yen <- c("Sentence,Value1,Value2",
"`ML Taper, Triology TM`,0,0",
"90481 3TBS/¥`10TRYS/1SR PAUL/JOE,0,0",
"`D/3,E/4`,0,0")
writeLines(yen, path.expand("~/yen.csv"))
Now the code
library(data.table)
# Read data without specifying encoding to handle ANSI or UTF8 yens
fl <- readLines(path.expand("~/yen.csv"))
# The yen symbol is 0xc2 0xa5 in UTF8, so we want it encoded this way
utf8_yen <- rawToChar(as.raw(c(0xc2, 0xa5)))
ansi_yen <- rawToChar(as.raw(0xa5))
fl <- gsub(utf8_yen, ansi_yen, fl)
# Paste on our backtick to get the backtick escape
yen_tick <- paste0(ansi_yen, "`")
# Change the backtick escape then remove all yen nsymbols
fl2 <- gsub(yen_tick, "&backtick;", fl)
fl2 <- gsub(ansi_yen, "", fl2)
# Save our modified string and reload it as a dataframe
writeLines(fl2, path.expand("~/Edited_data.txt"))
sms_data <- fread(path.expand("~/Edited_data.txt"),
sep = ",", stringsAsFactors = FALSE, quote = "\`", dec = ".")
# Now we can unescape our backticks and we're done
sms_data$Sentence <- gsub("&backtick;", "`", sms_data$Sentence)
So now we have
sms_data
#> Sentence Value1 Value2
#> 1: ML Taper, Triology TM 0 0
#> 2: 90481 3TBS/`10TRYS/1SR PAUL/JOE 0 0
#> 3: D/3,E/4 0 0
I have a column called 'WFBS' that has over a million rows of strings of different lengths that look like this:
WFBS <- c("M010203", "S01020304", "N104509")
and I need an output that looks like this:
WFBS1 <- c("M01", "S01", "N10")
WFBS2 <- c("02", "02", "45")
WFBS3 <- c("03", "03", "09")
WFBS4 <- c(NA, "04", NA)
So I need to separate each string in:
first column: 3 characters (ie the letter followed by 2 digits)
rest of the columns: 2 characters per column until I have no characters left
I tried using the function strsplit, but it says that my variables are not characters, so then I created a vector x as follows:
x <- as.character(WFBS)
but then I don't know how to separate the string into columns with the function strsplit.
An option with base R is to create a delimiter using sub, then read the result with read.csv to create a 4-column data.frame:
read.csv(text = sub("^(...)(..)(..)(.*)", "\\1,\\2,\\3,\\4", WFBS),
header = FALSE, colClasses = rep("character", 4), na.strings = "",
col.names =paste0("WFBS", 1:4), stringsAsFactors = FALSE)
# WFBS1 WFBS2 WFBS3 WFBS4
#1 M01 02 03 <NA>
#2 S01 02 03 04
#3 N10 45 09 <NA>
This might be a useful starting point:
library(tidyr)
df <- data.frame(WFBS = c("M010203", "S01020304", "N104509"),
stringsAsFactors = FALSE)
> df %>% separate(col = WFBS,
into = c("WFBS1","WFBS2","WFBS3","WFBS4"),
sep = c(3,5,7))
WFBS1 WFBS2 WFBS3 WFBS4
1 M01 02 03
2 S01 02 03 04
3 N10 45 09
This leaves you with empty strings rather than NAs in the remainder spots, which you'd have to convert.
I am trying to create an input file for another program that is space-delimited. I'm pasting together the contents of multiple columns and having problems when the numbers have different lengths, due to what appears to be a default right-justify in R. For example:
row_id monthly_spend
123 4.55
567 24.64
678 123.09
becomes :
row_id:123 monthly_spend: 4.55
row_id:567 monthly_spend: 24.64
row_id:678 monthly_spend:123.09
while what I need is this:
row_id:123 monthly_spend:4.55
row_id:567 monthly_spend:24.64
row_id:678 monthly_spend:123.09
the code I'm using is derived from this question here and looks like this:
paste(row_id, monthly_spend, sep=":", collapse=" ")
i've tried formatting the columns as numeric or integer without any change.
Any suggestions?
if you put your vectors into a data.frame (if they are not already)
you can use:
apply(sapply(names(myDF), function(x)
paste(x, myDF[, x], sep=":") ), 1, paste, collapse=" ")
# [1] "row_id:123 monthly_spend:4.55"
# [2] "row_id:567 monthly_spend:24.64"
# [3] "row_id:678 monthly_spend:123.09"
or alternatively:
do.call(paste, lapply(names(myDF), function(x) paste0(x, ":", myDF[, x])))
sprintf is also an option. You've got many ways of going about it
sample data used:
myDF <- read.table(header=TRUE, text=
"row_id monthly_spend
123 4.55
567 24.64
678 123.09")
With your data snippet:
df <- read.table(text = "row_id monthly_spend
123 4.55
567 24.64
678 123.09", header = TRUE)
Then we can paste together but employ the format function with trim = TRUE to take care of stripping the spaces you don't want:
with(df, paste("row_id:", row_id,
"monthly_spend:", format(monthly_spend, trim = TRUE)))
Which gives:
> with(df, paste("row_id:", row_id,
+ "monthly_spend:", format(monthly_spend, trim = TRUE)))
[1] "row_id: 123 monthly_spend: 4.55" "row_id: 567 monthly_spend: 24.64"
[3] "row_id: 678 monthly_spend: 123.09"
If you need this in a data frame before writing out to file, use:
newdf <- with(df, data.frame(foo = paste("row_id:", row_id,
"monthly_spend:",
format(monthly_spend, trim = TRUE))))
newdf
> newdf
foo
1 row_id: 123 monthly_spend: 4.55
2 row_id: 567 monthly_spend: 24.64
3 row_id: 678 monthly_spend: 123.09
When you write this out, the columns will be justified as you want.
Here is a general answer (any number of variables), assuming your data is in a data.frame dat:
x <- mapply(names(dat), dat, FUN = paste, sep = ":")
write.table(x, file = stdout(),
quote = FALSE, row.names = FALSE, col.names = FALSE)
And you can replace stdout() with a filename.
assuming the data frame is called df
write.table(as.data.frame(sapply(1:ncol(df),FUN=function(x)paste(rep(colnames(df)[x],nrow(df)),df[,x],sep=":"))),"someFileName",row.names=FALSE,col.names=FALSE,sep=" ");
equivalent to following substeps:
# generating the column separated records
df_cp<-sapply(1:ncol(df),FUN=function(x)paste(rep(colnames(df)[x],nrow(df)),df[,x],sep=":"));
### casting to data frame
df_cp<-as.data.frame(df_cp);
### writing out to disk
write.table(df_cp,"someFileName",row.names=FALSE,col.names=FALSE,sep=" ");
Very simple question. I am using an excel sheet that has two rows for the column headings; how can I convert these two row headings into one? Further, these headings don't start at the top of the sheet.
Thus, I have DF1
Temp Press Reagent Yield A Conversion etc
degC bar /g % %
1 2 3 4 5
6 7 8 9 10
and I want,
Temp degC Press bar Reagent /g Yield A % Conversion etc
1 2 3 4 5
6 7 8 9 10
Using colnames(DF1) returns the upper names, but getting the second line to merge with the upper one keeps eluding me.
Using your data, modified to quote text fields that contain the separator (get whatever tool you used to generate the file to quote text fields for you!)
txt <- "Temp Press Reagent 'Yield A' 'Conversion etc'
degC bar /g % %
1 2 3 4 5
6 7 8 9 10
"
this snippet of code below reads the file in two steps
First we read the data, so skip = 2 means skip the first 2 lines
Next we read the data again, but only the first two lines; this output is then further processed by sapply(), where we paste(x, collapse = " ") the strings in the columns of the labs data frame. These are assigned to the names of dat.
Here is the code:
dat <- read.table(text = txt, skip = 2)
labs <- read.table(text = txt, nrows = 2, stringsAsFactors = FALSE)
names(dat) <- sapply(labs, paste, collapse = " ")
dat
names(dat)
The code, when run, produces:
> dat <- read.table(text = txt, skip = 2)
> labs <- read.table(text = txt, nrows = 2, stringsAsFactors = FALSE)
> names(dat) <- sapply(labs, paste, collapse = " ")
>
> dat
Temp degC Press bar Reagent /g Yield A % Conversion etc %
1 1 2 3 4 5
2 6 7 8 9 10
> names(dat)
[1] "Temp degC" "Press bar" "Reagent /g"
[4] "Yield A %" "Conversion etc %"
In your case, you'll want to modify the read.table() calls to point at the file on your file system, so use file = "foo.txt" in place of text = txt in the code chunk, where "foo.txt" is the name of your file.
Also, if these headings don't start at the top of the file, then increase skip to 2+n where n is the number of lines before the two header rows. You'll also need to add skip = n to the second read.table() call which generates labs, where n is again the number of lines before the header lines.
This should work. You only need set stringsAsFactors=FALSE when reading data.
# Example input: the units row was read in as the first data row, so every
# column is character.
data <- structure(
  list(
    Temp = c("degC", "1", "6"),
    Press = c("bar", "2", "7"),
    Reagent = c("/g", "3", "8"),
    Yield.A = c("%", "4", "9"),
    Conversion = c("%", "5", "10")
  ),
  class = "data.frame",
  row.names = c(NA, -3L)
)
# Build the new names by pasting each header onto its unit from the first row.
# (The original referred to an undefined object `dados` here.)
colnames(data) <- paste(colnames(data), unlist(data[1, ]))
# Remove the units row and renumber the remaining rows.
data <- data[-1, ]
rownames(data) <- NULL
# Convert every column to numeric in place (works only if all columns hold
# numbers). as.real() is defunct, and data.frame(apply(...)) would rewrite the
# new names via check.names, so assign column-wise instead.
data[] <- lapply(data, as.numeric)
Just load your file with read.table(file, header = FALSE, stringsAsFactors = F) arguments. Then, you can grep to find the position this happens.
# Demo frame: ten shuffled numbers per column followed by the two header
# rows (name, then unit). Mixing numbers and text makes every column
# character.
df <- data.frame(
  V1 = c(sample(10), "Temp", "degC"),
  V2 = c(sample(10), "Press", "bar"),
  V3 = c(sample(10), "Reagent", "/g"),
  V4 = c(sample(10), "Yield_A", "%"),
  V5 = c(sample(10), "Conversion", "%"),
  stringsAsFactors = FALSE
)
# Locate the header rows through their known entries in the first column.
idx <- unique(c(grep("Temp", df$V1), grep("degC", df$V1)))
# Keep only the data rows ...
df2 <- df[-idx, ]
# ... and name the columns by collapsing the header rows with a space.
names(df2) <- vapply(
  df[idx, ],
  function(x) paste(x, collapse = " "),
  character(1)
)
Here, if you want, you can then convert all the columns to numeric as follows:
df2 <- as.data.frame(sapply(df2, as.numeric))