Reading a space-separated file in R, skipping the first lines

I'd like to read a file in R into a matrix of M by N.
The file is of the following form:
# /n/home11/tros/sar/tests/mars/abro 250
# /n/home11/tros/sar/tests/mars/abro 230
# /n/home11/tros/sar/tests/mars/abro 20
# /n/home11/tros/sar/tests/mars/abro 20
# T (M rows,N cols)
# M 3
# N 4
7.947363550e+03 1.066183995e+04 3.896434554e+03 8.319875735e+03
1.600281531e+04 1.991086422e+04 1.628421819e+03 1.239507171e+04
7.430547003e+03 2.349262184e+03 4.883555574e+03 4.986597752e+02
The first lines (all lines starting with #) should be skipped, but M and N could (potentially) be read from those header lines.
Then a numeric matrix of dimensions M by N (3 by 4 in this case) should be read; note that the separator is a single space (NOT tabs).
Thanks.

read.table will skip lines starting with # by default (its comment.char argument defaults to "#"):
s <- "# /n/home11/tros/sar/tests/mars/abro 250
# /n/home11/tros/sar/tests/mars/abro 230
# /n/home11/tros/sar/tests/mars/abro 20
# /n/home11/tros/sar/tests/mars/abro 20
# T (M rows,N cols)
# M 3
# N 4
7.947363550e+03 1.066183995e+04 3.896434554e+03 8.319875735e+03
1.600281531e+04 1.991086422e+04 1.628421819e+03 1.239507171e+04
7.430547003e+03 2.349262184e+03 4.883555574e+03 4.986597752e+02
"
read.table(header=FALSE, text=s)
## V1 V2 V3 V4
## 1 7947.364 10661.840 3896.435 8319.8757
## 2 16002.815 19910.864 1628.422 12395.0717
## 3 7430.547 2349.262 4883.556 498.6598
Rather than using text= you will probably want to use file= and supply a file name from which to read the data.
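For example, assuming the data shown above were saved to a file (the file name below is just a placeholder):
dat <- read.table("abro.txt", header = FALSE)  # lines starting with # are skipped via comment.char = "#"
The default sep = "" splits on any amount of whitespace, so single spaces between the numbers are handled.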

Lines <- readLines(textConnection(s))  # with a real file, use readLines("yourfile.txt") instead
M <- as.numeric( sub("^#\\sM" ,"" , Lines[grep("^#\\sM",Lines)]) )
M
#[1] 3
N <- as.numeric( sub("^#\\sN" ,"" , Lines[grep("^#\\sN",Lines)]) )
dat <- read.table(text=Lines[grep("^[^#]",Lines)])
dat
V1 V2 V3 V4
1 7947.364 10661.840 3896.435 8319.8757
2 16002.815 19910.864 1628.422 12395.0717
3 7430.547 2349.262 4883.556 498.6598
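Since the goal is an M-by-N numeric matrix, a short follow-up sketch using the M, N and dat computed above:
mat <- as.matrix(dat)                        # numeric matrix
stopifnot(nrow(mat) == M, ncol(mat) == N)    # dimensions agree with the header (3 by 4)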


Padding strings to the same length in R

Following hours of searching for what should be simple, I need help.
What I want to do:
Ensure that all strings are padded to the same length of 26 characters.
Dataset:
library(stringr)
names <-
structure(list(
names = c(
"A",
"ABC",
"ABCDEFG",
"ABCDEFGHIJKLMNOP",
"AB",
"ABCDEFGHI",
"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
"ABCDEFGHIJKL",
"ABCDEFGHIJKLMNOPQR",
"ABCDEFGHIJKLMNOP",
"ABCDEFGHIJKLMNO"
)
),
class = "data.frame",
row.names = c(NA,-11L))
Step 1:
Find max character length and the number of spaces to pad:
max <- as.numeric(max(nchar(names$names)))
max
n <- as.numeric(nchar(names$names))
n
pad <- max - n
pad
#add columns to the dataset to check how many characters are to be padded for each name
names$max <- as.numeric(max(nchar(names$names)))
names$n <- as.numeric(nchar(names$names))
names$pad <- as.numeric(max - n)
Step 2: Pad
names$names <-
str_pad(names$names,
pad,
side = "right",
pad = "0")
But this approach doesn't appear to be working for me. Can someone point me in the right direction? I am getting different length strings:
names max n pad
1 A000000000000000000000000 26 1 25
2 ABC00000000000000000000 26 3 23
3 ABCDEFG000000000000 26 7 19
4 ABCDEFGHIJKLMNOP 26 16 10
5 AB0000000000000000000000 26 2 24
6 ABCDEFGHI00000000 26 9 17
7 ABCDEFGHIJKLMNOPQRSTUVWXYZ 26 26 0
8 ABCDEFGHIJKL00 26 12 14
9 ABCDEFGHIJKLMNOPQR 26 18 8
10 ABCDEFGHIJKLMNOP 26 16 10
11 ABCDEFGHIJKLMNO 26 15 11
Help would be greatly appreciated.
Here we just need to pass the maximum width (not the per-string pad count) to str_pad:
library(stringr)
mx <- max(nchar(names$names))
names$names <- str_pad(names$names, mx, side = "right", pad = "0")
names$names
Output:
#[1] "A0000000000000000000000000" "ABC00000000000000000000000" "ABCDEFG0000000000000000000" "ABCDEFGHIJKLMNOP0000000000"
#[5] "AB000000000000000000000000" "ABCDEFGHI00000000000000000" "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "ABCDEFGHIJKL00000000000000"
#[9] "ABCDEFGHIJKLMNOPQR00000000" "ABCDEFGHIJKLMNOP0000000000" "ABCDEFGHIJKLMNO00000000000"
NOTE: It is better not to name objects with names that are also function or argument names (names, max and pad are all used that way here).
I think you want the format function. You set the width and then justify left, right or center:
format(names, width = 26, justify = "left")
# names
# 1 A
# 2 ABC
# 3 ABCDEFG
# 4 ABCDEFGHIJKLMNOP
# 5 AB
# 6 ABCDEFGHI
# 7 ABCDEFGHIJKLMNOPQRSTUVWXYZ
# 8 ABCDEFGHIJKL
# 9 ABCDEFGHIJKLMNOPQR
# 10 ABCDEFGHIJKLMNOP
# 11 ABCDEFGHIJKLMNO
Using rep and paste(..., collapse="") (roughly Python's join for a vector of strings), Vectorize(), and closing over pad (i.e. simply grabbing pad from the argument list), one can quickly create a pad-string generator reps.
Using paste0 one can then join the character vectors element-wise.
pad_strings <- function(char_vec, max_len=NULL, pad="0") {
  reps <- Vectorize(function(n) paste(rep(pad, n), collapse=""))
  lengths <- nchar(char_vec)
  if (is.null(max_len)) max_len <- max(lengths)
  diffs <- max_len - lengths
  paste0(char_vec, reps(diffs))
}
> pad_strings(names$names)
[1] "A0000000000000000000000000" "ABC00000000000000000000000"
[3] "ABCDEFG0000000000000000000" "ABCDEFGHIJKLMNOP0000000000"
[5] "AB000000000000000000000000" "ABCDEFGHI00000000000000000"
[7] "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "ABCDEFGHIJKL00000000000000"
[9] "ABCDEFGHIJKLMNOPQR00000000" "ABCDEFGHIJKLMNOP0000000000"
[11] "ABCDEFGHIJKLMNO00000000000"
If no argument is given for max_len=, the strings are padded to the length of the longest one; otherwise they are padded out to max_len.
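A quick sanity check of the max_len= argument (the width of 30 here is arbitrary):
nchar(pad_strings(names$names, max_len = 30))
# [1] 30 30 30 30 30 30 30 30 30 30 30
Every string comes back padded out to 30 characters.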

Batch Editing TXT to CSV in R

I have a series of .txt files that look like this:
Button,Intensity,Acc,Intensity,RT,Time
0,30,0,0,0,77987.931
1,30,1,13.5,0,78084.57
1,30,1,15,0,78098.624
1,30,1,6,0,78114.132
1,30,1,15,0,78120.669
They have file names like 1531_Day49.txt, 1531_Day50.txt, 1532_Day49.txt, 1532_Day50.txt etc
I want to load all the files in this directory into data frames, append a column with the difference from the Time in the row above (Tdelta), append a column with the first 4 digits of the file name (i.e. 1531, 1532) as an ID, and append a column called PrePost that decodes the day: "Pre" if the file name contains Day49 and "Post" if it contains Day50.
So ideal output for a 1531 Day 49 file would be:
Button,Intensity,Acc,Intensity,RT,Time,Tdelta,ID,PrePost
0,30,0,0,0,77987.931,0,1531,Pre
1,30,1,13.5,0,78084.57,96.639,1531,Pre
1,30,1,15,0,78098.624,14.054,1531,Pre
So far I have:
#call library
library(data.table)
#batch enter .txt files and put them into a data frame
setwd("~/Documents/PVTPASAT/PVT")
temp = list.files(pattern="*.txt")
list.DFs <- lapply(temp, fread)
#view print out to visually check
View(list.DFs)
#add column of time difference
list.DFs <- lapply(list.DFs, cbind, tDelta = c(0, diff(df$Time)))
#Add empty columns for ID and PrePost
list.DFs <- lapply(list.DFs, cbind, ID = c(""))
list.DFs <- lapply(list.DFs, cbind, PrePost = c(""))
#print one to visually check
View(list.DFs[3])
I would create a function to do the processing and then apply it to your list of files like so:
example <- read.delim(textConnection('
Button,Intensity,Acc,Intensity,RT,Time
0,30,0,0,0,77987.931
1,30,1,13.5,0,78084.57
1,30,1,15,0,78098.624
1,30,1,6,0,78114.132
1,30,1,15,0,78120.669'),
header = T,
sep = ','
)
write.table(example, '1531_Day49.txt', row.names = F)
temp <- list.files(pattern="*.txt")
process_txt <- function(x) {
dat <- data.table::fread(x, header = T)
dat$tdelta <- c(0, diff(dat$Time))
dat$ID <- substr(x, 1, 4)
dat$PrePost <- if (grepl('49\\.', x)) {'Pre'} else {'Post'}
dat
}
out <- lapply(temp, process_txt)
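If you then want one table instead of a list, a quick follow-up (assuming all files share the same columns) is:
library(data.table)
combined <- rbindlist(out)   # or: do.call(rbind, out)
combined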
Heather, the main guidance is to first solve one file properly, then place all of that working code into a function.
library(data.table)  ## for fread
library(dplyr)       ## for the lag function
library(stringr)     ## for str_detect
# make two test files
dt <- read.csv(text=
'Button,Intensity,Acc,Intensity,RT,Time
0,30,0,0,0,77987.931
1,30,1,13.5,0,78084.57
1,30,1,15,0,78098.624
1,30,1,6,0,78114.132
1,30,1,15,0,78120.669
')
write.csv(dt,"1531_Day49.txt")
write.csv(dt,"1532_Day50.txt")
# function to do the work for one file name - returns a dataframe
doOne <- function (file) {
# read
contents <- fread(file)
# compute delta
contents$Tdelta <- contents$Time - lag(contents$Time)
# prefix up to underscore
contents$ID <- strsplit(file, c("_"))[[1]][[1]]
# add the PrePost column using ifelse and str_detect
contents$PrePost <- ifelse(str_detect(file, "Day49"), "Pre", "Post")
return(contents)
}
#test files
files <- c("1531_Day49.txt", "1532_Day50.txt")
# call the function for each file -- result is
# a list of dataframes
lapply(files, doOne)
# better get them all into a single data frame for analysis
do.call(rbind, lapply(files, doOne))
# V1 Button Intensity Acc Intensity.1 RT Time Tdelta ID PrePost
# 1: 1 0 30 0 0.0 0 77987.93 NA 1531 Pre
# 2: 2 1 30 1 13.5 0 78084.57 96.639 1531 Pre
# 3: 3 1 30 1 15.0 0 78098.62 14.054 1531 Pre
# 4: 4 1 30 1 6.0 0 78114.13 15.508 1531 Pre
# 5: 5 1 30 1 15.0 0 78120.67 6.537 1531 Pre
# 6: 1 0 30 0 0.0 0 77987.93 NA 1532 Post
# 7: 2 1 30 1 13.5 0 78084.57 96.639 1532 Post
# 8: 3 1 30 1 15.0 0 78098.62 14.054 1532 Post
# 9: 4 1 30 1 6.0 0 78114.13 15.508 1532 Post
# 10: 5 1 30 1 15.0 0 78120.67 6.537 1532 Post

split dataframe with multiple delimiters in R

df1 <-
Gene GeneLocus
CPA1|1357 chr7:130020290-130027948:+
GUCY2D|3000 chr17:7905988-7923658:+
UBC|7316 chr12:125396194-125399577:-
C11orf95|65998 chr11:63527365-63536113:-
ANKMY2|57037 chr7:16639413-16685398:-
expected output
df2 <-
Gene.1 Gene.2 chr start end
CPA1 1357 7 130020290 130027948
GUCY2D 3000 17 7905988 7923658
UBC 7316 12 125396194 125399577
C11orf95 65998 11 63527365 63536113
ANKMY2 57037 7 16639413 16685398
I tried it this way:
install.packages("splitstackshape")
library(splitstackshape)
df1 <- cSplit(df1,"Gene", sep="|", direction="wide", fixed=T)
df1 <- cSplit(df1,"GeneLocus",sep=":",direction="wide", fixed=T)
df1 <- cSplit(df1,"GeneLocus_2",sep="-",direction="wide", fixed=T)
df1 <- data.frame(df1)
df1$GeneLocus_1 <- gsub("chr", "", df1$GeneLocus_1)
I would like to know if there is an alternative, simpler way to do it.
Here you go... Just ignore the warning; it doesn't affect the rest of the output, it only reflects that the strand information (:+ or :-) is being dropped.
library(tidyr)
library(dplyr)
df1 %>% separate(Gene, c("Gene.1","Gene.2")) %>% separate(GeneLocus, c("chr","start","end")) %>% mutate(chr=sub("chr","",chr))
Output:
Gene.1 Gene.2 chr start end
1 CPA1 1357 7 130020290 130027948
2 GUCY2D 3000 17 7905988 7923658
3 UBC 7316 12 125396194 125399577
4 C11orf95 65998 11 63527365 63536113
5 ANKMY2 57037 7 16639413 16685398
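If you also want to keep the strand, a hedged variation of the same pipe (splitting GeneLocus on : and - explicitly) would be:
df1 %>%
  separate(Gene, c("Gene.1", "Gene.2"), sep = "\\|") %>%
  separate(GeneLocus, c("chr", "start", "end", "strand"), sep = "[-:]") %>%
  mutate(chr = sub("chr", "", chr))
This gives the same columns as above plus a strand column, and no warning is produced.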
I would suggest something like the following approach:
Make a single delimiter in your "GeneLocus" column (and strip out the unnecessary parts while you're at it).
Split both columns at once. Note that cSplit "balances" the columns being split according to the number of output columns detected. Thus, since the first column would only result in 2 columns when split, but the second would result in 4, you would need to drop columns 3 and 4 from the result.
library(splitstackshape)
GLPat <- "^chr(\\d+):(\\d+)-(\\d+):([+-])$"
cSplit(as.data.table(mydf)[, GeneLocus := gsub(
GLPat, "\\1|\\2|\\3|\\4", GeneLocus)], names(mydf), "|")[
, 3:4 := NULL, with = FALSE][]
# Gene_1 Gene_2 GeneLocus_1 GeneLocus_2 GeneLocus_3 GeneLocus_4
# 1: CPA1 1357 7 130020290 130027948 +
# 2: GUCY2D 3000 17 7905988 7923658 +
# 3: UBC 7316 12 125396194 125399577 -
# 4: C11orf95 65998 11 63527365 63536113 -
# 5: ANKMY2 57037 7 16639413 16685398 -
Alternatively, you can try col_flatten from my "SOfun" package, with which you can do:
library(SOfun)
Pat <- "^chr(\\d+):(\\d+)-(\\d+):([+-])$"
Fun <- function(invec) strsplit(gsub(Pat, "\\1|\\2|\\3|\\4", invec), "|", TRUE)
col_flatten(as.data.table(mydf)[, lapply(.SD, Fun)], names(mydf), drop = TRUE)
# Gene_1 Gene_2 GeneLocus_1 GeneLocus_2 GeneLocus_3 GeneLocus_4
# 1: CPA1 1357 7 130020290 130027948 +
# 2: GUCY2D 3000 17 7905988 7923658 +
# 3: UBC 7316 12 125396194 125399577 -
# 4: C11orf95 65998 11 63527365 63536113 -
# 5: ANKMY2 57037 7 16639413 16685398 -
SOfun is only on GitHub, so you can install it with:
source("http://news.mrdwab.com/install_github.R")
install_github("mrdwab/SOfun")

How to loop a function over a text file?

I have a function that I want to use, but the inputs come from a text file.
Here is the Fun:
myfun <- function(latitude, longitude) {
  column <- latitude * 5
  row <- longitude * 3
  return(c(column, row))
}
Now I have a text file with information for my function.
cor=read.table("C:\\Data\\AMS.txt", sep="")
head(cor)
V1 V2 V3 V4 V5 V6
1 lat 13 lon 2 Site: As
2 lat 14 lon 3 Site: Ad
Output needed for instance:
lat lon column row site
13 2 ? ? As
I can do this manually, but as I have many rows it would be better to let R do it. Any hints are appreciated.
Try:
data.frame(lat=DF$V2, lon=DF$V4, column=DF$V2*5, row=DF$V4*3, site=DF$V6)
No need for cleaning.
Data
DF <- read.table(text=" V1 V2 V3 V4 V5 V6
1 lat 13 lon 2 Site: As
2 lat 14 lon 3 Site: Ad", header=T)
> DF1 <- data.frame(lat=DF$V2, lon=DF$V4, column=DF$V2*5, row=DF$V4*3, site=DF$V6)
> DF1
lat lon column row site
1 13 2 65 6 As
2 14 3 70 9 Ad
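If you do want to keep myfun itself, a hedged alternative is to apply it row by row with mapply (using the same DF as above):
res <- mapply(myfun, DF$V2, DF$V4)   # 2 x n matrix: row 1 = column, row 2 = row
DF2 <- data.frame(lat = DF$V2, lon = DF$V4, column = res[1, ], row = res[2, ], site = DF$V6)
This produces the same result as the vectorised version.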
Unless you need to use a function I would use a vectorised solution. First, I'd tidy up your data frame:
cor <- read.table("C:\\Data\\AMS.txt", sep="") # note <- not =
require("dplyr")
cor <- select(cor, -V1)
cor <- select(cor, -V3)
cor <- select(cor, -V5)
colnames(cor) <- c("lat", "long", "site")
Then I'd simply create a new variable for column and row:
cor$column <- cor$lat * 5
cor$row <- cor$long * 3
Yielding:
cor
# lat long site column row
# 1 13 2 As 65 6
# 2 14 3 Ad 70 9
EDIT: Based on your comments and edited post, you clearly have a more complex function, which I've attempted to vectorise below. The output is a vector of 5 items for each of column and row, so hopefully that's your expected behaviour.
kR = 6371.228 # recommend constants start with 'k'
kC = 25.067525
kNc = 1383
kNl = 586
kR0 = (kNc - 1) / 2
kS0 = (kNl - 1) / 2
kDeg2rad_cte = pi/180
cor$lamda <- cor$long * kDeg2rad_cte  # 'long' matches the column name set above
cor$phi <- cor$lat * kDeg2rad_cte
column <- round(kR0 + (kR / kC) %*% cor$lamda * cos(pi / 6)) + 1
row <- 586 - round(kS0 - (kR / kC) %*% sin(cor$phi) / cos(pi / 6))
column <- seq(min(column), max(column), by=1)
row <- seq(min(row), max(row), by=1)
column
# [1] 700 701 702 703 704
row
# [1] 360 361 362 363 364

Calculate mean of each n-rows in a dataframe in r when the first row is varying

First make some example data:
df = data.frame(matrix(rnorm(200), nrow=100))
df1=data.frame(t(c(25,34)))
The starting row is different in each column. For example, in X1 I would like to start from the 25th row, while in X2 from row 34. Then I want to calculate the mean of every 5 values over the next 50 rows, for all the columns in df.
I am new to R, so this is probably very obvious. Can anyone provide some suggestions on how I can do this?
You could try Map.
lst <- Map(function(x, y) {
           x1 <- x[y:length(x)]
           tapply(x1, as.numeric(gl(length(x1), 5, length(x1))),
                  FUN = mean)},
           df, df1)
lst
# $X1
# 1 2 3 4 5 6
#-0.16500158 0.11339623 -0.86961872 -0.54985564 0.19958461 0.35234983
# 7 8 9 10 11 12
#0.32792769 0.65989801 -0.30409184 -0.53264725 -0.45792792 -0.59139844
# 13 14 15 16
# 0.03934133 -0.38068187 0.10100007 1.21017392
#$X2
# 1 2 3 4 5 6
# 0.24525622 0.07367300 0.18733973 -0.43784202 -0.45756095 -0.45740178
# 7 8 9 10 11 12
#-0.54086152 0.10439072 0.65660937 0.70623380 -0.51640088 0.46506135
# 13 14
#-0.09428336 -0.86295101
Because the lengths differ, it might be better to keep the result as a list. But if you need it as a matrix/data.frame, you can make the lengths equal by padding with NAs.
do.call(cbind,lapply(lst, `length<-`,(max(sapply(lst, length)))))
Update
If you need only 50 rows, change y:length(x) to y:(y + 49) in the Map code.
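Under that assumption the call would look like this (a sketch of the same Map code, restricted to 50 values per column):
lst50 <- Map(function(x, y) {
             x1 <- x[y:(y + 49)]   # exactly the next 50 values
             tapply(x1, as.numeric(gl(length(x1), 5, length(x1))),
                    FUN = mean)},
             df, df1)
Each element of lst50 then contains 10 group means.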
data
set.seed(24)
df <- data.frame(matrix(rnorm(200), nrow=100))
df1 <- data.frame(t(c(25,34)))
Not entirely clear, especially the second line of your code, but I think this might be close to what you want to do:
every_fifth_row <- df[seq(1, nrow(df), 5), ]
every_fifth_row
# X1 X2
# 1 -0.09490455 -0.28417104
# 6 -0.14949662 0.12857284
# 11 0.15297366 -0.84428186
# 16 -1.03397309 0.04775516
# 21 -1.95735213 -1.03750794
# 26 1.61135194 1.10189370
# 31 0.12447365 1.80792719
# 36 -0.92344017 0.66639710
# 41 -0.88764143 0.10858376
# 46 0.27761464 0.98382526
# 51 -0.14503359 -0.66868956
# 56 -1.70208187 0.05993688
# 61 0.33828525 1.00208639
# 66 -0.41427863 1.07969341
# 71 0.35027994 -1.46920059
# 76 1.38943839 0.01844205
# 81 -0.81560917 -0.32133221
# 86 1.38188423 -0.77755471
# 91 1.53247872 -0.98660308
# 96 0.45721909 -0.22855622
rowMeans(every_fifth_row)
colMeans(every_fifth_row)
# Alternative
# apply(every_fifth_row, 1, mean) # Row-wise mean
# apply(every_fifth_row, 2, mean) # Column-wise mean
