How to loop a function over a text file? - R

I have a function that I want to use, but its inputs come from a text file.
Here is the function:
myfun <- function(latitude, longitude) {
  column <- latitude * 5
  row <- longitude * 3
  return(c(column, row))
}
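For a single coordinate pair this returns, for example:
myfun(13, 2)
# [1] 65  6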
Now I have a text file with information for my function.
cor=read.table("C:\\Data\\AMS.txt", sep="")
head(cor)
V1 V2 V3 V4 V5 V6
1 lat 13 lon 2 Site: As
2 lat 14 lon 3 Site: Ad
The output I need, for instance:
lat lon column row site
13 2 ? ? As
I can do this manually, but as I have many rows it would be better to let R do it. Any hints are appreciated.

Try:
data.frame(lat=DF$V2, lon=DF$V4, column=DF$V2*5, row=DF$V4*3, site=DF$V6)
No need for cleaning.
Data
DF <- read.table(text=" V1 V2 V3 V4 V5 V6
1 lat 13 lon 2 Site: As
2 lat 14 lon 3 Site: Ad", header=T)
> DF1 <- data.frame(lat=DF$V2, lon=DF$V4, column=DF$V2*5, row=DF$V4*3, site=DF$V6)
> DF1
lat lon column row site
1 13 2 65 6 As
2 14 3 70 9 Ad
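If you do want to call myfun itself row by row instead of vectorising, a minimal sketch using mapply (this assumes myfun always returns the two values c(column, row), so the result simplifies to a 2-row matrix):
res <- mapply(myfun, DF$V2, DF$V4)
# res[1, ] holds the column values, res[2, ] the row values
data.frame(lat=DF$V2, lon=DF$V4, column=res[1, ], row=res[2, ], site=DF$V6)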

Unless you need to use a function, I would use a vectorised solution. First, I'd tidy up your data frame:
cor <- read.table("C:\\Data\\AMS.txt", sep="") # note <- not =
library(dplyr)
cor <- select(cor, -V1, -V3, -V5)
colnames(cor) <- c("lat", "long", "site")
Then I'd simply create a new variable for column and row:
cor$column <- cor$lat * 5
cor$row <- cor$long * 3
Yielding:
cor
# lat long site column row
# 1 13 2 As 65 6
# 2 14 3 Ad 70 9
EDIT: Based on your comments and edited post, you clearly have a more complex function, which I've attempted to vectorise below. The output is a vector of 5 items for each of column and row, so hopefully that's your expected behaviour.
kR = 6371.228 # recommend constants start with 'k'
kC = 25.067525
kNc = 1383
kNl = 586
kR0 = (kNc - 1) / 2
kS0 = (kNl - 1) / 2
kDeg2rad_cte = pi/180
cor$lambda <- cor$long * kDeg2rad_cte # note: the column was renamed to "long" above
cor$phi <- cor$lat * kDeg2rad_cte
column <- round(kR0 + (kR / kC) * cor$lambda * cos(pi / 6)) + 1
row <- 586 - round(kS0 - (kR / kC) * sin(cor$phi) / cos(pi / 6))
column <- seq(min(column), max(column), by=1)
row <- seq(min(row), max(row), by=1)
column
# [1] 700 701 702 703 704
row
# [1] 360 361 362 363 364
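If you'd still prefer a function interface, the vectorised formulas above can be wrapped up; a sketch using the same constants (the function name is just for illustration):
latlon2colrow <- function(lat, lon) {
  lambda <- lon * kDeg2rad_cte
  phi <- lat * kDeg2rad_cte
  data.frame(
    column = round(kR0 + (kR / kC) * lambda * cos(pi / 6)) + 1,
    row = 586 - round(kS0 - (kR / kC) * sin(phi) / cos(pi / 6))
  )
}
latlon2colrow(cor$lat, cor$long)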

Related

Batch Editing TXT to CSV in R

I have a series of .txt files that look like this:
Button,Intensity,Acc,Intensity,RT,Time
0,30,0,0,0,77987.931
1,30,1,13.5,0,78084.57
1,30,1,15,0,78098.624
1,30,1,6,0,78114.132
1,30,1,15,0,78120.669
They have file names like 1531_Day49.txt, 1531_Day50.txt, 1532_Day49.txt, 1532_Day50.txt etc
I want to load all the files in this directory into data frames, append a column that is the difference from the Time in the row above (tdelta), and append two more columns: an ID column holding the first 4 digits of the filename (i.e. 1531, 1532), and a PrePost column that decodes the Day part of the filename, so each row would be "Pre" if the filename contains Day49 and "Post" if it contains Day50.
So ideal output for a 1531 Day 49 file would be:
Button,Intensity,Acc,Intensity,RT,Time,Tdelta,ID,PrePost
0,30,0,0,0,77987.931,0,1531,Pre
1,30,1,13.5,0,78084.57,96.639,1531,Pre
1,30,1,15,0,78098.624,14.054,1531,Pre
So far I have:
#call library
library(data.table)
#batch enter .txt files and put them into a data frame
setwd("~/Documents/PVTPASAT/PVT")
temp = list.files(pattern="*.txt")
list.DFs <- lapply(temp, fread)
#view print out to visually check
View(list.DFs)
#add column of time difference
list.DFs <- lapply(list.DFs, function(df) cbind(df, tDelta = c(0, diff(df$Time))))
#Add empty columns for ID and PrePost
list.DFs <- lapply(list.DFs, cbind, ID = c(""))
list.DFs <- lapply(list.DFs, cbind, PrePost = c(""))
#print one to visually check
View(list.DFs[3])
I would create a function to do the processing and then apply it to your list of files like so:
example <- read.delim(textConnection('
Button,Intensity,Acc,Intensity,RT,Time
0,30,0,0,0,77987.931
1,30,1,13.5,0,78084.57
1,30,1,15,0,78098.624
1,30,1,6,0,78114.132
1,30,1,15,0,78120.669'),
header = T,
sep = ','
)
write.table(example, '1531_Day49.txt', row.names = F)
temp <- list.files(pattern="*.txt")
process_txt <- function(x) {
dat <- data.table::fread(x, header = T)
dat$tdelta <- c(0, diff(dat$Time))
dat$ID <- substr(x, 1, 4)
dat$PrePost <- if (grepl('49\\.', x)) {'Pre'} else {'Post'}
dat
}
out <- lapply(temp, process_txt)
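If you then want a single combined table rather than a list, a small follow-up sketch (data.table::rbindlist stacks the list; base do.call(rbind, out) works too):
combined <- data.table::rbindlist(out)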
#Heather, the main guidance is to first solve one file properly, then place all that working code into a function.
library(data.table) ## for fread
library(dplyr) ## for lag function
library(stringr) ## for str_detect
# make two test files
dt <- read.csv(text=
'Button,Intensity,Acc,Intensity,RT,Time
0,30,0,0,0,77987.931
1,30,1,13.5,0,78084.57
1,30,1,15,0,78098.624
1,30,1,6,0,78114.132
1,30,1,15,0,78120.669
')
write.csv(dt,"1531_Day49.txt")
write.csv(dt,"1532_Day50.txt")
# function to do the work for one file name - returns a dataframe
doOne <- function (file) {
# read
contents <- fread(file)
# compute delta
contents$Tdelta <- contents$Time - lag(contents$Time)
# prefix up to underscore
contents$ID <- strsplit(file, "_")[[1]][1]
# add the PrePost flag using ifelse and str_detect
contents$PrePost <- ifelse(str_detect(file, "Day49"), "Pre", "Post")
return(contents)
}
#test files
files <- c("1531_Day49.txt", "1532_Day50.txt")
# call the function for each file -- result is
# a list of dataframes
lapply(files, doOne)
# better get them all into a single data frame for analysis
do.call(rbind, lapply(files, doOne))
# V1 Button Intensity Acc Intensity.1 RT Time Tdelta ID PrePost
# 1: 1 0 30 0 0.0 0 77987.93 NA 1531 Pre
# 2: 2 1 30 1 13.5 0 78084.57 96.639 1531 Pre
# 3: 3 1 30 1 15.0 0 78098.62 14.054 1531 Pre
# 4: 4 1 30 1 6.0 0 78114.13 15.508 1531 Pre
# 5: 5 1 30 1 15.0 0 78120.67 6.537 1531 Pre
# 6: 1 0 30 0 0.0 0 77987.93 NA 1532 Post
# 7: 2 1 30 1 13.5 0 78084.57 96.639 1532 Post
# 8: 3 1 30 1 15.0 0 78098.62 14.054 1532 Post
# 9: 4 1 30 1 6.0 0 78114.13 15.508 1532 Post
# 10: 5 1 30 1 15.0 0 78120.67 6.537 1532 Post

Calculate mean of each n rows in a data frame in R when the starting row varies

First make some example data:
df = data.frame(matrix(rnorm(200), nrow=100))
df1=data.frame(t(c(25,34)))
The starting row is different in each column. For example, in X1 I would like to start from the 25th row, while in X2 from row 34. Then I want to calculate the mean of every 5 values over the next 50 rows, for all the columns in df.
I am new to R so this is probably very obvious. Can anyone provide some suggestions on how I can do this?
You could try Map.
lst <- Map(function(x, y) {
  x1 <- x[y:length(x)]
  tapply(x1, as.numeric(gl(length(x1), 5, length(x1))), FUN = mean)
}, df, df1)
lst
# $X1
# 1 2 3 4 5 6
#-0.16500158 0.11339623 -0.86961872 -0.54985564 0.19958461 0.35234983
# 7 8 9 10 11 12
#0.32792769 0.65989801 -0.30409184 -0.53264725 -0.45792792 -0.59139844
# 13 14 15 16
# 0.03934133 -0.38068187 0.10100007 1.21017392
#$X2
# 1 2 3 4 5 6
# 0.24525622 0.07367300 0.18733973 -0.43784202 -0.45756095 -0.45740178
# 7 8 9 10 11 12
#-0.54086152 0.10439072 0.65660937 0.70623380 -0.51640088 0.46506135
# 13 14
#-0.09428336 -0.86295101
Because of the length difference, it might be better to keep it as a list. But, if you need it in a matrix/data.frame, you can make the lengths equal by padding with NAs.
do.call(cbind,lapply(lst, `length<-`,(max(sapply(lst, length)))))
Update
If you need only 50 rows, then change y:length(x) to y:(y+49) in the Map code.
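Spelled out, the modified call would be:
lst <- Map(function(x, y) {
  x1 <- x[y:(y + 49)]
  tapply(x1, as.numeric(gl(length(x1), 5, length(x1))), FUN = mean)
}, df, df1)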
data
set.seed(24)
df <- data.frame(matrix(rnorm(200), nrow=100))
df1 <- data.frame(t(c(25,34)))
Not entirely clear, especially the second line of your code, but I think this might be close to what you want to do:
every_fifth_row <- df[seq(1, nrow(df), 5), ]
every_fifth_row
# X1 X2
# 1 -0.09490455 -0.28417104
# 6 -0.14949662 0.12857284
# 11 0.15297366 -0.84428186
# 16 -1.03397309 0.04775516
# 21 -1.95735213 -1.03750794
# 26 1.61135194 1.10189370
# 31 0.12447365 1.80792719
# 36 -0.92344017 0.66639710
# 41 -0.88764143 0.10858376
# 46 0.27761464 0.98382526
# 51 -0.14503359 -0.66868956
# 56 -1.70208187 0.05993688
# 61 0.33828525 1.00208639
# 66 -0.41427863 1.07969341
# 71 0.35027994 -1.46920059
# 76 1.38943839 0.01844205
# 81 -0.81560917 -0.32133221
# 86 1.38188423 -0.77755471
# 91 1.53247872 -0.98660308
# 96 0.45721909 -0.22855622
rowMeans(every_fifth_row)
colMeans(every_fifth_row)
# Alternative
# apply(every_fifth_row, 1, mean) # Row-wise mean
# apply(every_fifth_row, 2, mean) # Column-wise mean
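If what you actually need is the mean of each consecutive block of 5 values (as the accepted answer computes), a compact base R sketch for the 50-row window starting at row 25 of X1:
x <- df$X1[25:74]               # 50 values starting at row 25
colMeans(matrix(x, nrow = 5))   # one mean per block of 5 consecutive values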

Correct way of vectorizing "lookup" function

I am looking for a fast and efficient way to compute the problem described below. Any help would be appreciated, thanks in advance!
I have a couple of very large csv files that hold different information about the same objects, but in my final calculation I need all of the attributes from the different tables. I am trying to calculate the load of a large number of electrical substations. First, I have a list of unique electrical substations:
Unique_Substations <- data.frame(Name = c("SubA", "SubB", "SubC", "SubD"))
In another list I have information about the customers behind these substations:
Customer_Information <- data.frame(
Customer = 1001:1010,
SubSt_Nm = sample(unique(Unique_Substations$Name), 10, replace = TRUE),
HouseHoldType = sample(1:2, 10, replace = TRUE)
)
And in another list I have information about, let's say, the solar panels on these customers' roofs (for different years):
Solar_Panels <- data.frame(
Customer = sample(1001:1010, 10, replace = TRUE),
SolarPanelYear1 = sample(10:20, 10, replace = TRUE),
SolarPanelYear2 = sample(15:20, 10, replace = TRUE)
)
Now I want to see what the load is for each substation for each year. I have a household load and a solar panel load, normalised for each type of household or solar panel:
SolarLoad <- data.frame(Load = c(0, -10, -10, 5))
HouseHoldLoad <- data.frame(Type1 = c(1, 3, 5, 2), Type2 = c(3, 5, 6, 1))
So now I have to match up these lists:
ML_SubSt_Cust <- sapply(Unique_Substations$Name,
function(x) which(Customer_Information$SubSt_Nm %in% x == TRUE))
ML_Cust_SolarP <- sapply(Customer_Information$Customer,
function(x) which(Solar_Panels$Customer %in% x == TRUE))
(Here I use the which(xxx %in% x == TRUE) method because I need multiple matches, and match() only returns the first match.)
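A quick illustration of that difference:
match("SubA", Customer_Information$SubSt_Nm)       # position of the first match only
which(Customer_Information$SubSt_Nm %in% "SubA")   # positions of all matches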
And now we come to my big question (but probably not my only problem with this method). I want to calculate the maximum load on each substation for each year. To this end I first wrote a for loop over the Unique_Substations list, which is of course highly inefficient. After that I tried to speed it up using outer(), but I don't think I have properly vectorized my function. My maximum function looks as follows (I only wrote it out for the solar panel part to keep it simple):
GetMax <- function(i, Yr) {
max(sum(Solar_Panels[unlist(ML_Cust_SolarP[ML_SubSt_Cust[[i]]], use.names= FALSE),Yr])*SolarLoad)
}
I'm sure this is not efficient at all but I have no clue how to do it in any other way.
To get my final results I use an outer() call:
Results <- outer(1:nrow(Unique_Substations), 1:2, Vectorize(GetMax))
In my example all of these data frames are much, much larger (40000 rows each or so), so I really need some good optimization of the functions involved. I tried to think of ways to vectorize the function but couldn't work it out. Any help would be appreciated.
EDIT:
Now that I fully understand the accepted answer, I have another problem. My actual Customer_Information is 188k rows long and my actual HouseHoldLoad is 53k rows long. Needless to say, this does not merge() very well. Is there another solution to this problem that does not require merge() or for loops that are too slow?
First: set.seed() when generating random data! I did set.seed(1000) before your code for these results.
I think a bit of merge-ing and dplyr can help here. First, we get the data into a better shape:
library(dplyr)
library(reshape2)
HouseHoldLoad <- melt(HouseHoldLoad, value.name="Load") %>%
select(HouseHoldType=variable, Load) %>%
mutate(HouseHoldType=gsub("Type", "", HouseHoldType))
Solar_Panels <- melt(Solar_Panels, id.vars="Customer",
value.name="SPYearVal") %>%
select(Customer, SolarPanelYear=variable, SPYearVal) %>%
mutate(SolarPanelYear=gsub("SolarPanelYear", "", SolarPanelYear))
dat <- merge(Customer_Information, Solar_Panels, by="Customer")
That gives us:
## Customer SubSt_Nm HouseHoldType SolarPanelYear SPYearVal
## 1 1001 SubB 1 1 16
## 2 1001 SubB 1 2 18
## 3 1001 SubB 1 2 16
## 4 1001 SubB 1 1 20
## 5 1002 SubD 2 1 16
## 6 1002 SubD 2 1 13
## 7 1002 SubD 2 2 20
## 8 1002 SubD 2 2 18
## 9 1003 SubA 1 2 15
## 10 1003 SubA 1 1 16
## 11 1005 SubC 2 2 19
## 12 1005 SubC 2 1 10
## 13 1006 SubA 1 1 15
## 14 1006 SubA 1 2 19
## 15 1007 SubC 1 1 17
## 16 1007 SubC 1 2 19
## 17 1009 SubA 1 1 10
## 18 1009 SubA 1 1 18
## 19 1009 SubA 1 2 18
## 20 1009 SubA 1 2 18
Now we just group and summarize:
dat %>% group_by(SubSt_Nm, SolarPanelYear) %>%
summarise(mx=max(sum(SPYearVal)*SolarLoad))
## SubSt_Nm SolarPanelYear mx
## 1 SubA 1 295
## 2 SubA 2 350
## 3 SubB 1 180
## 4 SubB 2 170
## 5 SubC 1 135
## 6 SubC 2 190
## 7 SubD 1 145
## 8 SubD 2 190
If you use data.table vs data frames, it should be pretty speedy even with 40K entries.
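For reference, a minimal data.table sketch of that same grouped summary (assuming dat from the merge above):
library(data.table)
setDT(dat)
dat[, .(mx = max(sum(SPYearVal) * SolarLoad$Load)), by = .(SubSt_Nm, SolarPanelYear)]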
UPDATE: For those who cannot install dplyr, this version just uses reshape2 (hopefully that is installable):
library(reshape2)
HouseHoldLoad <- melt(HouseHoldLoad, value.name="Load")
colnames(HouseHoldLoad) <- c("HouseHoldType", "Load")
HouseHoldLoad$HouseHoldType <- gsub("Type", "", HouseHoldLoad$HouseHoldType)
Solar_Panels <- melt(Solar_Panels, id.vars="Customer", value.name="SPYearVal")
colnames(Solar_Panels) <- c("Customer", "SolarPanelYear", "SPYearVal")
Solar_Panels$SolarPanelYear <- gsub("SolarPanelYear", "", Solar_Panels$SolarPanelYear)
dat <- merge(Customer_Information, Solar_Panels, by="Customer")
rbind(by(dat, list(dat$SubSt_Nm, dat$SolarPanelYear), function(x) {
mx <- max(sum(x$SPYearVal) * SolarLoad)
}))
## 1 2
## SubA 295 350
## SubB 180 170
## SubC 135 190
## SubD 145 190
If you really can't install even reshape2, then this works with just the base stats package:
colnames(HouseHoldLoad) <- c("Load.1", "Load.2")
HouseHoldLoad <- reshape(HouseHoldLoad, varying=c("Load.1", "Load.2"), direction="long", timevar="HouseHoldType")[1:2]
colnames(Solar_Panels) <- c("Customer", "SolarPanelYear.1", "SolarPanelYear.2")
Solar_Panels <- reshape(Solar_Panels, varying=c("SolarPanelYear.1", "SolarPanelYear.2"), direction="long", timevar="SolarPanelYear")[1:2]
colnames(Solar_Panels) <- c("Customer", "SPYearVal")
Solar_Panels$SolarPanelYear <- gsub("^[0-9]+\\.", "", rownames(Solar_Panels))
dat <- merge(Customer_Information, Solar_Panels, by="Customer")
rbind(by(dat, list(dat$SubSt_Nm, dat$SolarPanelYear), function(x) {
mx <- max(sum(x$SPYearVal) * SolarLoad)
}))
## 1 2
## SubA 295 350
## SubB 180 170
## SubC 135 190
## SubD 145 190

R: How do I create k subsets subject to a condition applied to k columns?

Example:
fdistm[which(fdistm[, 4] <= 5), ]
zip long lat 1 2 3 4 5
32403 72756 -94.07141 36.34224 3 122 19401 51 704
32404 72757 -94.11565 36.37198 2 141 19546 45 740
32405 72758 -94.14622 36.30662 1 149 19578 30 724
fdistm[which(fdistm[, 5] <= 5), ]
zip long lat 1 2 3 4 5
32312 72601 -93.09345 36.23698 181 1 16548 328 354
fdistm[which(fdistm[, 6] <= 5), ]
zip long lat 1 2 3 4 5
13271 30529 -83.46842 34.21042 16867 15514 1 17241 12593
How do I loop over k columns to create k subsets?
Thanks!
Use lapply as follows:
k <- 5
subsets <- lapply(3 + 1:k, function(x, i, m) x[x[, i] <= m, ], x = fdistm, m = 5)
It will return a list of k subsets.
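To keep track of which column produced which subset, you could also name the list; a one-line sketch assuming the distance columns keep the names shown above:
names(subsets) <- colnames(fdistm)[3 + 1:k]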
Use apply, assuming your data frame is fdistm (not tested):
fdistm[apply(fdistm[, -1], MARGIN = 1, function(x) all(x <= 5)), ]
Note: this applies the condition to every column except the first.

Reading a file in R space separated, skip first lines

I'd like to read a file in R into a matrix of M by N.
The file is of the following form:
# /n/home11/tros/sar/tests/mars/abro 250
# /n/home11/tros/sar/tests/mars/abro 230
# /n/home11/tros/sar/tests/mars/abro 20
# /n/home11/tros/sar/tests/mars/abro 20
# T (M rows,N cols)
# M 3
# N 4
7.947363550e+03 1.066183995e+04 3.896434554e+03 8.319875735e+03
1.600281531e+04 1.991086422e+04 1.628421819e+03 1.239507171e+04
7.430547003e+03 2.349262184e+03 4.883555574e+03 4.986597752e+02
The first lines (all lines starting with #) should be skipped, but M and N could (potentially) be read from those header lines.
Then a numeric matrix of dimensions M by N (3 by 4 in this case) should be read; note that the separator is a plain space (NOT tabs).
Thanks.
read.table will skip lines starting with # by default:
s <- "# /n/home11/tros/sar/tests/mars/abro 250
# /n/home11/tros/sar/tests/mars/abro 230
# /n/home11/tros/sar/tests/mars/abro 20
# /n/home11/tros/sar/tests/mars/abro 20
# T (M rows,N cols)
# M 3
# N 4
7.947363550e+03 1.066183995e+04 3.896434554e+03 8.319875735e+03
1.600281531e+04 1.991086422e+04 1.628421819e+03 1.239507171e+04
7.430547003e+03 2.349262184e+03 4.883555574e+03 4.986597752e+02
"
read.table(header=FALSE, text=s)
## V1 V2 V3 V4
## 1 7947.364 10661.840 3896.435 8319.8757
## 2 16002.815 19910.864 1628.422 12395.0717
## 3 7430.547 2349.262 4883.556 498.6598
Rather than using text= you will probably want to use file= and supply a file name from which to read the data.
Lines <- readLines(textConnection(s)) # with a file on disk, use readLines("yourfile.txt")
M <- as.numeric( sub("^#\\sM" ,"" , Lines[grep("^#\\sM",Lines)]) )
M
#[1] 3
N <- as.numeric( sub("^#\\sN" ,"" , Lines[grep("^#\\sN",Lines)]) )
dat <- read.table(text=Lines[grep("^[^#]",Lines)])
dat
V1 V2 V3 V4
1 7947.364 10661.840 3896.435 8319.8757
2 16002.815 19910.864 1628.422 12395.0717
3 7430.547 2349.262 4883.556 498.6598
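Since the question asks for an M-by-N numeric matrix, the parsed data frame can be converted and checked against the header values read above; a short sketch:
mat <- as.matrix(dat)
stopifnot(nrow(mat) == M, ncol(mat) == N)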
