I have the CSV file like below:
data,key
"VA1,VA2,20140524,,0,0,5969,20140523134902,S7,S1147,140,20140523134902,m/t",4503632376496128
"VA2,VA3,20140711,,0,0,8824,20140601095714,S1,S6402,175,20140601095839,m/t",4503643113914368
I try to read it with R, but I don't need key value and data value should be read to separate columns. With the following code I get almost what I need:
# Read with the header skipped (skip = 1 replaces header = TRUE) and quoting
# disabled (quote = "") so the quoted first field is split into columns.
# NOTE(review): quote = "" is also why literal " characters survive into
# V1/V13 below — R never treats the quotes as delimiters.
data <- read.csv(fileCSV, header = FALSE, sep = ",", skip = 1, comment.char = "", quote = "")
I skip header line there (skip = 1), say that I don't have it (header = FALSE), and say that I don't have quotes (quote = ""). But in result I get quote characters in V1 and V13 columns and extra V14 column:
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14
1 "VA1 VA2 20140524 NA 0 0 5969 2.014121e+13 S7 S1147 140 2.014121e+13 m/t" 4.503608e+15
Should I delete it somehow after reading csv? Or, is there any better way to read such csv files?
Upd. I use the following approach to delete quotes:
# Remove the literal quote characters left over by quote = "": the leading
# one sits on the first column, the trailing one on the last.
data$V1 <- sub("^\"", "", data$V1)
data$V13 <- sub("\"$", "", data$V13)
But factor type is changed to character for these columns.
How about a system command with fread()?
# Reproduce the input file on disk.
writeLines(
'data,key
"VA1,VA2,20140524,,0,0,5969,20140523134902,S7,S1147,140,20140523134902,m/t",4503632376496128
"VA2,VA3,20140711,,0,0,8824,20140601095714,S1,S6402,175,20140601095839,m/t",4503643113914368', "x.txt"
)
# Use library(), not require(): require() only returns FALSE when bit64 is
# missing, and fread() would then silently read the 16-digit keys as doubles.
library(bit64)
# Shell pipeline: rev|cut|rev strips everything after the last quote (the
# ,key field), tail -n +2 drops the header, fread parses the 13 inner fields.
data.table::fread("cat x.txt | rev | cut -d '\"' -f2 | rev | tail -n +2")
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
# 1: VA1 VA2 20140524 NA 0 0 5969 20140523134902 S7 S1147 140 20140523134902 m/t
# 2: VA2 VA3 20140711 NA 0 0 8824 20140601095714 S1 S6402 175 20140601095839 m/t
Here's a test on the two methods, as requested.
## 150k lines.  writeLines() terminates every element with "\n" itself, so
## the strings must NOT embed their own newline: the original "\n" suffixes
## wrote a blank line after every record (read.csv only coped because
## blank.lines.skip = TRUE by default, and the grep filter dropped them).
writeLines(c("data,key", rep_len(
'"VA1,VA2,20140524,,0,0,5969,20140523134902,S7,S1147,140,20140523134902,m/t",4503632376496128', 1.5e5)),
"test.txt"
)
## fread() in well under 1 second (with bit64 loaded)
## NOTE(review): grep -e '^V' keeps only the data rows (they start with "VA"),
## which also discards the header line surviving the rev|cut|rev trick.
system.time({
dt <- data.table::fread(
"cat test.txt | rev | cut -d '\"' -f2 | rev | grep -e '^V'"
)
})
# user system elapsed
# 0.945 0.108 0.547
## your current read.csv() method in just over two seconds
system.time({
df <- read.csv("test.txt", header = FALSE, sep = ",", skip = 1,
               comment.char = "", quote = "")
# strip the literal quote characters left behind by quote = ""
df$V1 <- sub("^\"", "", df$V1)
df$V13 <- sub("\"$", "", df$V13)
})
# user system elapsed
# 2.134 0.000 2.129
dim(dt)
# [1] 150000 13
dim(df)
# [1] 150000 14   (the 14th column is the unparsed ,key field read.csv keeps)
Related
I have a data.table with many rows that look like this in R:
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
NCBINCC GenBank gene 331 1008 . - . gene_id=UL1 protein_id=ABV71500.1
NCBINCC GenBank gene 1009 1120 . - . gene_id=UL4 protein_id=ABV71520
NCBINCC GenBank gene 1135 1200 . - . gene_id=UL6 protein_id=ABV71525
Is there a simple way to add quotes in between strings (after the strings gene_id= and protein_id=) so that they only encompass the different gene and proteins like the following output:
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
NCBINCC GenBank gene 331 1008 . - . gene_id="UL1" protein_id="ABV71500.1"
NCBINCC GenBank gene 1009 1120 . - . gene_id="UL4" protein_id="ABV71520"
NCBINCC GenBank gene 1135 1200 . - . gene_id="UL6" protein_id="ABV71525"
I have seen this answer for shell, but wanted to know if there was a way to also do it in R. Thank you kindly.
We can use str_replace with a regex lookaround to match the =, capture the alphanumeric characters including the . and replace with the backreference (\\1) quoted
library(stringr)
library(dplyr)
# Quote the value following "="; the lookbehind keeps "=" out of the match,
# and the backreference \1 re-inserts the captured value between the quotes.
df1 <- df1 %>%
  mutate(across(c(V9, V10),
                function(x) str_replace(x, "(?<=\\=)([[:alnum:].]+)", '"\\1"')))
-output
df1
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
#1 NCBINCC GenBank gene 331 1008 . - . gene_id="UL1" protein_id="ABV71500.1"
#2 NCBINCC GenBank gene 1009 1120 . - . gene_id="UL4" protein_id="ABV71520"
#3 NCBINCC GenBank gene 1135 1200 . - . gene_id="UL6" protein_id="ABV71525"
Forgot to use the corresponding option using base R
# Same substitution with base R only; perl = TRUE enables the lookbehind.
nm1 <- c("V9", "V10")
for (nm in nm1) {
  df1[[nm]] <- sub("(?<=\\=)([[:alnum:].]+)", '"\\1"', df1[[nm]], perl = TRUE)
}
data
# Reproducible sample data (character columns stay character: R >= 4.0
# defaults to stringsAsFactors = FALSE).
df1 <- data.frame(
  V1 = rep("NCBINCC", 3),
  V2 = rep("GenBank", 3),
  V3 = rep("gene", 3),
  V4 = c(331L, 1009L, 1135L),
  V5 = c(1008L, 1120L, 1200L),
  V6 = ".",
  V7 = "-",
  V8 = ".",
  V9 = c("gene_id=UL1", "gene_id=UL4", "gene_id=UL6"),
  V10 = c("protein_id=ABV71500.1", "protein_id=ABV71520", "protein_id=ABV71525")
)
If you would rather avoid extra packages, you can use sub() inside an lapply().
v <- c('V9', 'V10')
# Spell the argument names out: the original relied on partial matching
# (pa= / re= for pattern / replacement), which lintr flags and which breaks
# if sub() ever gains another argument sharing those prefixes.
d[v] <- lapply(d[v], function(col) sub('\\=(.*)', '="\\1"', col))
d
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
# 1 NCBINCC GenBank gene 331 1008 . - . gene_id="UL1" protein_id="ABV71500.1"
# 2 NCBINCC GenBank gene 1009 1120 . - . gene_id="UL4" protein_id="ABV71520"
# 3 NCBINCC GenBank gene 1135 1200 . - . gene_id="UL6" protein_id="ABV71525"
Data
# Sample data.  Use TRUE, never T: T is an ordinary variable that can be
# reassigned, so header=T is not guaranteed to mean header=TRUE.
d <- read.table(header = TRUE, text = 'V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
NCBINCC GenBank gene 331 1008 . - . gene_id=UL1 protein_id=ABV71500.1
NCBINCC GenBank gene 1009 1120 . - . gene_id=UL4 protein_id=ABV71520
NCBINCC GenBank gene 1135 1200 . - . gene_id=UL6 protein_id=ABV71525')
I would use mutate and stringr:
# Use library(), not require(): require() silently returns FALSE when the
# package is missing, and the pipeline then fails later with a confusing error.
library(dplyr)
library(stringr)
myTable %>%
  mutate(across(c(V9, V10),
                function(x) {
                  firstHalf <- str_extract(x, "^.+=")       # everything up to and including the '='
                  secondHalf <- str_extract(x, "(?<==).*$") # everything after the '='
                  # Add quotes to secondHalf
                  newSecondHalf <- paste0("\"", secondHalf, "\"")
                  # Glue it all back together and spit it out
                  paste0(firstHalf, newSecondHalf)
                }))
Assuming a data table named mydatatable, i used gsub and paste0.
library(dplyr)
# Insert an opening quote right after every "=", then append the closing one.
mydatatable <- mydatatable %>%
  mutate(across(c(V9, V10), function(x) paste0(gsub("=", '="', x), '"')))
Although I can't imagine it not having been asked before in one way or another, I don't seem to be able to find something that answers my question.
I have data that looks like this
> mydata1
V1
,10.00,20.00,30.00,40.00
,11.00,22.00,33.00,44.00
And I'd like to have data that looks like:
> mydata2
V1 V2 V3 V4
10.00 20.00 30.00 40.00
11.00 22.00 33.00 44.00
When I try read.table and separation with "," I get:
> mydata2 <- read.table(mydata1, sep = ",")
Error in read.table(mydata1, sep = ",") :
'file' must be a character string or connection
I tried some Regex magic, but this didn't work (mostly because I have no deep understanding in the matter).
Any help is much appreciated!
We can use read.csv after removing the , at the start of the string with sub
# Drop the leading comma first, then let read.csv() split on the rest.
cleaned <- sub("^,", "", mydata1$V1)
mydata2 <- read.csv(text = cleaned, header = FALSE)
mydata2
# V1 V2 V3 V4
#1 10 20 30 40
#2 11 22 33 44
library(tidyverse)
# Splitting on "," yields an empty first piece (from the leading comma);
# name it V0 and discard it afterwards.
mydata1 %>%
  separate(V1, into = c("V0", "V1", "V2", "V3", "V4"), sep = ",") %>%
  select(-V0)
# V1 V2 V3 V4
# 1 10.00 20.00 30.00 40.00
# 2 11.00 22.00 33.00 44.00
I wanna skip the first three columns. Couldn't quite understand the posts about colClasses because I'm new to R.
YDL025C YDL025C 1 -0.1725 -0.5375 -0.4970 -0.3818 -0.5270 -0.4260 -0.6929 -0.4020 -0.3263 -0.3373 -0.3532 -0.2771 -0.2732 -0.3307 -0.4660 -0.4314 -0.3135
YKL032C YKL032C 1 -0.2364 0.0794 0.1678 0.2389 0.3847 0.2625 0.1889 0.2681 0.0363 -0.1992 -0.0521 -0.0307 0.0584 0.2817 0.2239 -0.0253 0.0751
If you have to use read.table and you want to filter on the way in, you can use colClasses as follows. You have 20 columns. Say the first 2 are character, the rest are numeric, and you want to drop columns 4, 5 and 6. You construct a vector of length 20 detailing that information. The "NULL" entries will not pull in those columns.
# colClasses must have one entry per raw column (20 here); "NULL" entries
# are skipped entirely instead of being read and dropped later.
x <- read.table(file = "datat.txt",
                colClasses = c("character", "character",  # cols 1-2: IDs
                               "numeric",                 # col 3
                               "NULL", "NULL", "NULL",    # cols 4-6: skipped
                               rep("numeric", 14)),       # cols 7-20
                header = FALSE)
x
V1 V2 V3 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20
1 YDL025C YDL025C 1 -0.3818 -0.5270 -0.4260 -0.6929 -0.4020 -0.3263 -0.3373 -0.3532 -0.2771 -0.2732 -0.3307 -0.4660 -0.4314 -0.3135
2 YKL032C YKL032C 1 0.2389 0.3847 0.2625 0.1889 0.2681 0.0363 -0.1992 -0.0521 -0.0307 0.0584 0.2817 0.2239 -0.0253 0.0751
As commented above, easier to remove the columns after reading in. For example:
# Read everything first; unwanted columns are pruned afterwards.
mydf <- read.table("mydf.txt")
Then,
# Returns mydf without its first 3 columns; mydf itself is unchanged.
# NOTE(review): if only one column remained this would collapse to a vector;
# add drop = FALSE to always keep a data.frame.
mydf[, 4:ncol(mydf)]
will remove the first 3 columns.
Lake Elsinore 9.7 F W 60.2 131 1 1 0 2310.1
Lake Elsinore 10.4 F W 53.9 67 0 0 0 1815.9
Lake Elsinore 10.1 M W 54.3 96 1 1 1 1872.9
Lake Elsinore 9.6 M W 55.1 72 1 . 1 1980.4
So here I have ten variables V1-V10. How can I read this into R? As you can see, the first variable ("Lake Elsinore") itself contains a space, so I can't simply split the fields on spaces. Could someone help me find a way to easily import this kind of data?
Thank you so so much!
Here are two approaches:
1) It could be done with read.pattern in the gsubfn package. The matches to the parenthesized portions of the pattern are read in as separate fields:
library(gsubfn)
# One greedy group captures the (space-containing) lake name, followed by
# nine whitespace-free fields; each parenthesized group becomes a column.
pattern <- paste0("^(.*)", strrep(" (\\S+)", 9))
read.pattern("myfile.dat", pattern, na.strings = ".")
giving:
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
1 Lake Elsinore 9.7 F W 60.2 131 1 1 0 2310.1
2 Lake Elsinore 10.4 F W 53.9 67 0 0 0 1815.9
3 Lake Elsinore 10.1 M W 54.3 96 1 1 1 1872.9
4 Lake Elsinore 9.6 M W 55.1 72 1 NA 1 1980.4
2) Read in the lines as they are, replace the first space on each line with some character (here we use underscore), re-read it now using read.table and then replace the underscore with space:
raw_lines <- readLines("myfile.dat")
masked <- sub(" ", "_", raw_lines)   # sub() replaces only the FIRST space per line
DF <- read.table(text = masked, na.strings = ".")
DF[[1]] <- sub("_", " ", DF[[1]])    # restore the space inside column 1
giving the same answer.
It's a little clunky, but I usually just read it in raw and parse the data from there. You could do something like:
# First, read in all columns space separated
df <- read.table(FILE, header = FALSE, sep = " ")
# Create a new column (V12) that's a concatenation of V1, V2.
# within() returns a modified COPY, so the result must be assigned back —
# the original discarded it, leaving df without V12 at all.
df <- within(df, V12 <- paste(V1, V2, sep = " "))
# And then drop the unwanted columns: keep the combined name plus V3..V11
# (the original df[, 2:11] kept the bare V2 and dropped the new V12).
df <- df[, c("V12", paste0("V", 3:11))]
Remember, you have 11 columns reading it in raw, which is why I'm creating a 12th.
I have a CSV file I'm importing, but some columns are not getting the correct formatting. It's very strange and I can't figure it out. The entire top row is formatting the columns as characters instead of numeric. I believe it is getting the formatting from the V1/time1 column?
> dde = read.csv("dde.csv",header=F, sep=",",stringsAsFactors=FALSE)
> dde <- na.omit(dde)
> dde <- as.data.frame(dde)
> time1 = as.POSIXct(strptime(paste(dde$V1, sep=" "),format="%m/%d/%Y %I:%M:%S %p"))
> head(dde)
V1 V2 V3 V4 V5 V6 V7 V8
1 9/7/2014 9:20:00 PM 105.061 136.099 169.961 98.391 96.515 112.802 87.277
2 9/7/2014 9:26:00 PM 105.068 136.074 169.954 98.399 96.521 112.790 87.276
3 9/7/2014 9:31:00 PM 105.078 136.107 170.031 98.414 96.528 112.813 87.287
4 9/7/2014 9:35:00 PM 105.068 136.102 170.001 98.424 96.516 112.789 87.289
5 9/7/2014 9:41:00 PM 105.074 136.109 169.994 98.422 96.519 112.821 87.300
6 9/7/2014 9:45:00 PM 105.091 136.114 170.028 98.420 96.539 112.829 87.302
V9 V10 V11 V12 V13 V14 V15 V16 V17
1 1.29531 0.80054 1.38283 1.40974 1.20601 1.55867 1.61761 0.93644 1.08825
2 1.29503 0.80041 1.38256 1.40949 1.20607 1.55817 1.61749 0.93643 1.08828
3 1.29514 0.80026 1.38256 1.40963 1.20607 1.55828 1.61796 0.93650 1.08832
4 1.29520 0.80038 1.38250 1.40957 1.20594 1.55819 1.61791 0.93666 1.08835
5 1.29517 0.80042 1.38259 1.40965 1.20590 1.55843 1.61777 0.93658 1.08840
6 1.29519 0.80046 1.38275 1.40969 1.20588 1.55860 1.61780 0.93648 1.08834
V18 V19 V20 V21 V22 V23 V24 V25 V26
1 0.93103 0.83073 1.72682 1.77693 1.50608 1.94649 1.01918 0.87190 1.12698
2 0.93106 0.83075 1.72689 1.77693 1.50593 1.94627 1.01912 0.87187 1.12676
3 0.93109 0.83069 1.72704 1.77693 1.50638 1.94661 1.01929 0.87202 1.12684
4 0.93110 0.83082 1.72687 1.77693 1.50645 1.94631 1.01941 0.87213 1.12694
5 0.93101 0.83080 1.72681 1.77693 1.50613 1.94643 1.01934 0.87199 1.12701
6 0.93097 0.83070 1.72706 1.77693 1.50613 1.94680 1.01927 0.87190 1.12696
V27 V28 V29 V30
1 0.85511 0.90400 0.77324 1268.81
2 0.85520 0.90390 0.77332 1268.81
3 0.85517 0.90405 0.77328 1268.81
4 0.85515 0.90415 0.77333 1268.81
5 0.85508 0.90423 0.77344 1268.81
6 0.85513 0.90412 0.77334 1268.81
> V22 = xts(dde$V22, order.by=time1)
> V22 <-to.minutes(V22[,1],240,'minutes')
> V22 <- align.time(xts(V22),5 * 60)
>
> V2 = xts(dde$V2, order.by=time1)
> V2 <-to.minutes(V2[,1],240,'minutes')
Error in to.period(x, "minutes", k = k, name = name, ...) :
unsupported type
> V2 <- align.time(xts(V2),5 * 60)
>
> class(dde$V22)
[1] "numeric"
> class(dde$V2)
[1] "character"
> typeof(dde$V22)
[1] "double"
> typeof(dde$V2)
[1] "character"
When I do the things you said you did in the comments, it works for me
dde <- read.csv("dde.csv", header = FALSE, stringsAsFactors = FALSE)
dde <- na.omit(dde)
names(dde)[1:2] <- c("time", "test1")
# Parse the "9/7/2014 9:20:00 PM"-style timestamps from column 1.
time1 <- as.POSIXct(strptime(dde$time, format = "%m/%d/%Y %I:%M:%S %p"))
test1 <- xts(dde[["test1"]], order.by = time1)
test1 <- to.minutes(test1[, 1], 240, "minutes")
test1 <- align.time(xts(test1), 5 * 60)
test1
# minutes.Open minutes.High minutes.Low minutes.Close
#2014-09-07 21:30:00 105.061 105.068 105.061 105.068