Loop to Create and save dataframes into list - r

I have example code and data below. What I'm trying to figure out is how write a loop that would create say x (in this example x=3) dataframes from a dataframe (in this example datadf) and save those dataframes in a list. The main part I'm stuck on is how to save each dataframe into a list. Any tips are greatly appreciated.
The updated code below seems to just about work, except the beginning index on the dataframes always stays at 1, instead of stepping 10 ahead each time. Anybody know what the issue is?
Update:
N<-3
x<-vector("list",N)
for (i in 1:N)
{
a<-(1:100)*rnorm(1,0.5)
b<-(1:100)*rnorm(1,2)
datadf<-as.data.frame(cbind(a,b))
n<-10
t<-50
datadfn<-datadf[((i-1)*n+1):(t+2*(i-1)*n),]
x[[i]]<-datadfn
}
Example Code:
n<-10
t<-50
datadf1<-datadf[1:t,]
datadf2<-datadf[(n+1):(t+n),]
datadf3<-datadf[(2*n+1):(t+2*n),]
dfList<-list(datadf1, datadf2, datadf3)
Data:
dput(datadf)
structure(list(a = c(2.00134717160119, 4.00269434320238, 6.00404151480358,
8.00538868640477, 10.006735858006, 12.0080830296072, 14.0094302012083,
16.0107773728095, 18.0121245444107, 20.0134717160119, 22.0148188876131,
24.0161660592143, 26.0175132308155, 28.0188604024167, 30.0202075740179,
32.0215547456191, 34.0229019172203, 36.0242490888215, 38.0255962604226,
40.0269434320238, 42.028290603625, 44.0296377752262, 46.0309849468274,
48.0323321184286, 50.0336792900298, 52.035026461631, 54.0363736332322,
56.0377208048334, 58.0390679764346, 60.0404151480358, 62.041762319637,
64.0431094912381, 66.0444566628393, 68.0458038344405, 70.0471510060417,
72.0484981776429, 74.0498453492441, 76.0511925208453, 78.0525396924465,
80.0538868640477, 82.0552340356489, 84.0565812072501, 86.0579283788513,
88.0592755504524, 90.0606227220536, 92.0619698936548, 94.063317065256,
96.0646642368572, 98.0660114084584, 100.06735858006, 102.068705751661,
104.070052923262, 106.071400094863, 108.072747266464, 110.074094438066,
112.075441609667, 114.076788781268, 116.078135952869, 118.07948312447,
120.080830296072, 122.082177467673, 124.083524639274, 126.084871810875,
128.086218982476, 130.087566154077, 132.088913325679, 134.09026049728,
136.091607668881, 138.092954840482, 140.094302012083, 142.095649183685,
144.096996355286, 146.098343526887, 148.099690698488, 150.101037870089,
152.102385041691, 154.103732213292, 156.105079384893, 158.106426556494,
160.107773728095, 162.109120899697, 164.110468071298, 166.111815242899,
168.1131624145, 170.114509586101, 172.115856757703, 174.117203929304,
176.118551100905, 178.119898272506, 180.121245444107, 182.122592615708,
184.12393978731, 186.125286958911, 188.126634130512, 190.127981302113,
192.129328473714, 194.130675645316, 196.132022816917, 198.133369988518,
200.134717160119), b = c(2.05061146723527, 4.10122293447054,
6.15183440170581, 8.20244586894108, 10.2530573361764, 12.3036688034116,
14.3542802706469, 16.4048917378822, 18.4555032051174, 20.5061146723527,
22.556726139588, 24.6073376068232, 26.6579490740585, 28.7085605412938,
30.7591720085291, 32.8097834757643, 34.8603949429996, 36.9110064102349,
38.9616178774701, 41.0122293447054, 43.0628408119407, 45.113452279176,
47.1640637464112, 49.2146752136465, 51.2652866808818, 53.315898148117,
55.3665096153523, 57.4171210825876, 59.4677325498228, 61.5183440170581,
63.5689554842934, 65.6195669515287, 67.6701784187639, 69.7207898859992,
71.7714013532345, 73.8220128204697, 75.872624287705, 77.9232357549403,
79.9738472221756, 82.0244586894108, 84.0750701566461, 86.1256816238814,
88.1762930911166, 90.2269045583519, 92.2775160255872, 94.3281274928224,
96.3787389600577, 98.429350427293, 100.479961894528, 102.530573361764,
104.581184828999, 106.631796296234, 108.682407763469, 110.733019230705,
112.78363069794, 114.834242165175, 116.88485363241, 118.935465099646,
120.986076566881, 123.036688034116, 125.087299501351, 127.137910968587,
129.188522435822, 131.239133903057, 133.289745370293, 135.340356837528,
137.390968304763, 139.441579771998, 141.492191239234, 143.542802706469,
145.593414173704, 147.644025640939, 149.694637108175, 151.74524857541,
153.795860042645, 155.846471509881, 157.897082977116, 159.947694444351,
161.998305911586, 164.048917378822, 166.099528846057, 168.150140313292,
170.200751780527, 172.251363247763, 174.301974714998, 176.352586182233,
178.403197649469, 180.453809116704, 182.504420583939, 184.555032051174,
186.60564351841, 188.656254985645, 190.70686645288, 192.757477920115,
194.808089387351, 196.858700854586, 198.909312321821, 200.959923789056,
203.010535256292, 205.061146723527)), .Names = c("a", "b"), row.names = c(NA,
-100L), class = "data.frame")

Simply change your second expression (t+2*(i-1)*n) to (t+(i-1)*n) or to align with first expression ((i-1)*n+t). Also, consider lapply over the for loop as its return is a list equal to input seq(N) or [1] 1 2 3:
N <- 3
n<-10
t<-50
dfList <- lapply(seq(N), function(i) {
a <- (1:100)*rnorm(1,0.5)
b <- (1:100)*rnorm(1,2)
datadf <- as.data.frame(cbind(a,b))
datadf[((i-1)*n+1):((i-1)*n+t),]
})
Or an easier read:
dfList <- lapply(seq(N), function(i) {
a <- (1:100)*rnorm(1,0.5)
b <- (1:100)*rnorm(1,2)
s <- (i-1)*n
datadf <- as.data.frame(cbind(a,b))
datadf[(s+1):(s+t),]
})

Related

Calling variable works outside of loop, but not inside loop

I am looping through vars_macro. The first variable in vars_macro is c1372 (dput below). The below code works perfectly fine:
len <- 32
c1372[1:(len-z),1:1]
However when I try to call the same variable (c1372) in code below, I get an error:
Error in m[1:(len - z), 1:1] : incorrect number of dimensions
Code:
output <- list()
forecast <- list()
for(m in noquote(vars_macro)){
output[[m]] <- list() # treat output as a list-of-lists
fit[[m]] <- list() # treat fit as a list-of-lists
for(z in rev(1:6)) {
x <- m[1:(len-z),1:1]
x <- ts((x), start = c(2011, 4), frequency = 4)
y <- Macro[1:(len-z),2:2]
y <- ts((y), start = c(2011, 4), frequency = 4)
t <- Macro[(len+1-z):(len+1-z),3:10]
t <- ts((t), start = c(2019, 2), frequency = 4)
#fit model
fit[[m]][[z]] <-auto.arima(y,xreg=x,seasonal=TRUE)
output[[m]][[z]] <- forecast(fit[[m]][[z]],xreg=t)$mean
}
}
Note the code above fails on the first variable (c1372), so the issue isn't the other variables. You can confirm this by simply writing for(m in ("c1372"))
Dput:
dput(vars_macro)
c("c1372", "c5244", "c5640", "c6164", "b1372", "b5244", "b5640",
"b6164", "v1372", "v5244", "v5640", "v6164", "bv1372", "bv5244",
"bv5640", "bv6164")
dput(c1372)
structure(list(c1372 = c(1.386445329, 1.600103663, 1.906186443,
1.962067415, 2.716663882, 1.875961101, 2.086589462, 2.115101307,
2.960605275, 2.109288864, 2.730920081, 2.816577742, 4.006180002,
3.503741762, 4.162132837, 4.122407811, 5.352681171, 3.961705849,
4.773003078, 4.575654378, 5.71727247, 4.401603262, 5.204187541,
4.7354794, 5.809822373, 4.137968937, 4.881120131, 4.812274313,
6.143882981, 4.935116748, 5.95001413, 5.384694268)), row.names = c(NA,
-32L), class = "data.frame")
The code in the OP fails because once the line for(m in noquote(vars_macro)) executes for the first time, m is set to a single element character vector, c1372.
Therefore, x <- m[1:(len-z),1:1] fails because m is a single element character vector, not a data frame with 32 rows and one column.
In R, everything is an object, and it's important to know the types of objects one is manipulating. The mechanism to move back and forth between a character vector and an actual object are the two R functions, get() and assign().
assign() assigns a name with an object. get() retrieves an object, given a name.
If what is required is to access the c1372 data frame instead of the character vector c1372, one needs to use the get() function to get a named object.
Illustrating with the data provided in the OP:
vars_macro <- "c1372"
c1372 <- structure(list(c1372 = c(1.386445329, 1.600103663, 1.906186443,
1.962067415, 2.716663882, 1.875961101, 2.086589462, 2.115101307,
2.960605275, 2.109288864, 2.730920081, 2.816577742, 4.006180002,
3.503741762, 4.162132837, 4.122407811, 5.352681171, 3.961705849,
4.773003078, 4.575654378, 5.71727247, 4.401603262, 5.204187541,
4.7354794, 5.809822373, 4.137968937, 4.881120131, 4.812274313,
6.143882981, 4.935116748, 5.95001413, 5.384694268)), row.names = c(NA,
-32L), class = "data.frame")
len <- 32
theData <- NULL
for (m in vars_macro){
theData <- get(m)
}
# print first few rows to show that get() worked
head(theData)
...and the output:
> # print first few rows to show that get() worked
> head(theData)
c1372
1 1.386445
2 1.600104
3 1.906186
4 1.962067
5 2.716664
6 1.875961
>

svm file handling in R

I have a multi-label classification problem. I have a dataset available at the following link:
dataset
This data set is originally from siam competition 2007. The dataset comprises of aviation safety reports describing the problem(s) which occurred in certain flights. It is a multi-classification, high dimensional problem. It has 21519 rows and 30438 columns.
The dataset contains .svm format file.
I have read the file with the help of "read.delim" in R.
After that I got following output:
head(data[,1])
1 18 2:0.136082763488 6:0.136082763488 7:0.136082763488
12:0.136082763488 20:0.136082763488 23:0.136082763488
32:0.136082763488 37:0.136082763488 39:0.136082763488
43:0.136082763488 53:0.136082763488 57:0.136082763488
58:0.136082763488 59:0.136082763488 60:0.136082763488
61:0.136082763488 62:0.136082763488 63:0.136082763488
64:0.136082763488 65:0.136082763488 66:0.136082763488
67:0.136082763488 68:0.136082763488 69:0.136082763488
70:0.136082763488 71:0.136082763488 72:0.136082763488
73:0.136082763488 74:0.136082763488 75:0.136082763488
76:0.136082763488 77:0.136082763488 78:0.136082763488
79:0.136082763488 80:0.136082763488 81:0.136082763488
82:0.136082763488 83:0.136082763488 84:0.136082763488
85:0.136082763488 86:0.136082763488 87:0.136082763488
88:0.136082763488 89:0.136082763488 90:0.136082763488
91:0.136082763488 92:0.136082763488 93:0.136082763488
94:0.136082763488 95:0.136082763488 96:0.136082763488
97:0.136082763488 98:0.136082763488 99:0.136082763488
[2] 1,12,13,18,20 2:0.0916698497028 4:0.0916698497028
6:0.0916698497028 12:0.0916698497028 14:0.0916698497028
16:0.0916698497028 19:0.0916698497028 23:0.0916698497028
26:0.0916698497028 31:0.0916698497028 32:0.0916698497028
33:0.0916698497028 37:0.0916698497028 53:0.0916698497028
57:0.0916698497028 66:0.0916698497028 71:0.0916698497028
72:0.0916698497028 81:0.0916698497028 83:0.0916698497028
84:0.0916698497028 86:0.0916698497028 90:0.0916698497028
92:0.0916698497028 100:0.0916698497028 101:0.0916698497028
102:0.0916698497028 103:0.0916698497028 104:0.0916698497028
105:0.0916698497028 106:0.0916698497028 107:0.0916698497028
108:0.0916698497028 109:0.0916698497028 110:0.0916698497028
111:0.0916698497028 112:0.0916698497028 113:0.0916698497028
114:0.0916698497028 115:0.0916698497028 116:0.0916698497028
117:0.0916698497028 118:0.0916698497028 119:0.0916698497028
120:0.0916698497028 121:0.0916698497028 122:0.0916698497028
123:0.0916698497028 124:0.0916698497028 125:0.0916698497028
126:0.0916698497028 127:0.0916698497028 128:0.0916698497028
129:0.0916698497028 130:0.0916698497028 131:0.0916698497028
132:0.0916698497028 133:0.0916698497028 134:0.0916698497028
135:0.0916698497028 136:0.0916698497028 137:0.0916698497028
138:0.0916698497028 139:0.0916698497028 140:0.0916698497028
141:0.0916698497028 142:0.0916698497028 143:0.0916698497028
144:0.0916698497028 145:0.0916698497028 146:0.0916698497028
147:0.0916698497028 148:0.0916698497028 149:0.0916698497028
150:0.0916698497028 151:0.0916698497028 152:0.0916698497028
153:0.0916698497028 154:0.0916698497028 155:0.0916698497028
156:0.0916698497028 157:0.0916698497028 158:0.0916698497028
159:0.0916698497028 160:0.0916698497028 161:0.0916698497028
162:0.0916698497028 163:0.0916698497028 164:0.0916698497028
165:0.0916698497028 166:0.0916698497028 167:0.0916698497028
168:0.0916698497028 169:0.0916698497028 170:0.0916698497028
171:0.0916698497028 172:0.0916698497028 173:0.0916698497028
174:0.0916698497028 175:0.0916698497028 176:0.0916698497028
177:0.0916698497028 178:0.0916698497028 179:0.0916698497028
180:0.0916698497028 181:0.0916698497028 182:0.0916698497028
183:0.0916698497028 184:0.0916698497028 185:0.0916698497028
186:0.0916698497028 187:0.0916698497028 188:0.0916698497028
189:0.0916698497028 190:0.0916698497028 191:0.0916698497028
192:0.0916698497028 193:0.0916698497028 194:0.0916698497028
How can I convert it into the regular dataset?
Any other method than read.delim for reading ".svm" file in R will also be helpful.
Maybe the solution contains a number of loops. But it solved my problem.
Below is the R-code :
rm(list=ls())
data <- read.delim(file.choose(),header=F)
# Now using strsplit function to create a regular dataser
temp <- list()
for(i in 1:length(data$V1)){
temp[i] <- strsplit(as.character(data$V1[i]),c(" "))
}
response <- list()
for(i in 1:length(temp)){
response[[i]] <- as.numeric(strsplit(temp[[i]][1],",")[[1]])
}
# Now working for responses
l.response <- 0
for (i in 1:length(response)){
l.response[i] <- length(response[[i]])
}
col.names <- paste(rep("R",22),1:22,sep="")
l.r <- length(temp)
df.response <- data.frame(R1=rep(0,l.r),R2=rep(0,l.r),R3=rep(0,l.r),R4=rep(0,l.r),R5=rep(0,l.r)
,R6=rep(0,l.r),R7=rep(0,l.r),R8=rep(0,l.r),R9=rep(0,l.r),R10=rep(0,l.r)
,R11=rep(0,l.r),R12=rep(0,l.r),R13=rep(0,l.r),R14=rep(0,l.r),R15=rep(0,l.r)
,R16=rep(0,l.r),R17=rep(0,l.r),R18=rep(0,l.r),R19=rep(0,l.r),R20=rep(0,l.r)
,R21=rep(0,l.r),R22=rep(0,l.r))
for(i in 1:length(response)){
df.response[i,(response[[i]]+1)] <- 1
}
feature <- c(0)
value <- c(0)
v.l <- 21519
v.list <- list()
list.name <- paste(rep("V",v.l),1:v.l,sep="")
f.vec <- 0
v.vec <- 0
for(i in 1:length(temp)){
for(j in 2:length(temp[[i]])){
f.vec[j-1] <- as.numeric(strsplit(temp[[i]][j],":")[[1]])[1]
v.vec[j-1] <- as.numeric(strsplit(temp[[i]][j],":")[[1]])[2]
}
v.list[[i]] <- data.frame(f.vec,v.vec)
}
feature.name <- paste(rep("V",30438),1:30438,sep="")
v.l <- 21519
variables <- data.frame(temp = rep(0,v.l))
for(i in 1:length(feature.name)){
variables[,feature.name[i]] <- rep(0,v.l)
}
variables <- variables[,-1]
copy.variables <- variables
for(i in 1:100){
pos <- v.list[[i]][,"f.vec"]
replace <- v.list[[i]][,"v.vec"]
if(length(unique(pos))!=length(pos)){
repeat{
uni <- as.numeric(attr(which(table(pos)>1), "names"))
for(k in 1:length(uni)){
t.pos <- which(pos==uni[k])
pos <- pos[-t.pos[1]]
replace <- replace[-t.pos[1]]
}
if(length(unique(pos))==length(pos)) break
}
}
variables[i,pos]<- replace
}
dim(df.response)
dim(variables)
Below code will give the final data with 100 rows and 100 columns.
final.data <- cbind(variables[1:100,],df.response[1:100,])
Welcome for other solutions. #LenGreski

Create and combine 2 grid.tables

I have created a grid.table object to display a dataframe in PowerBi, below there is my code:
dataset <- data.frame(BDS_ID = c("001","002"),
PRIORITY = c("high","medium"),
STATUS = c("onair","onair"),
COMPANY = c("airfr","fly"))
my.result <- melt(dataset, id = c("BDS_ID"))
mytheme <- ttheme_default(base_size = 10,
core=list(fg_params=list(hjust=0, x=0.01),
bg_params=list(fill=c("white", "grey90"))))
for (i in 1:nrow(tg)) {
tg$grobs[[i]] <- editGrob(tg$grobs[[i]], gp=gpar(fontface="bold"))
}
grid.draw(tg)
and this is my output:
I would like to improve my output in the following way: I would like that the row headers to be unique and have a different column for each different value of each variable repeating the column with the row headers each time.
I tried to do this using the statement t(dataset), but I do not get the desired result because the row headers are not repeated.
I would like to get an output (always classy grob) similar to this:
**PRIORITY** high **PRIORITY** medium
**STATUS** onair **STATUS** onair
**COMPANY** airfr **COMPANY** fly
Does anyone knows how to achive this?
Thanks
I'm unable to reproduce the grob format you've shown based on the code you've provided, but I've got something similar:
dataset <- data.frame(BDS_ID = c("001","002"),
PRIORITY = c("high","medium"),
STATUS = c("onair","onair"),
COMPANY = c("airfr","fly"))
dataset <- data.frame(t(dataset))
dataset$label1 <- rownames(dataset)
dataset$label2 <- rownames(dataset)
colnames(dataset) <- c("status1", "status2", "label1", "label2")
dataset <- dataset[c(2:nrow(dataset)), c(3, 1, 4, 2)]
rownames(dataset) <- NULL
test <- grid.draw(tableGrob(dataset))
The above code produces the following object. It doesn't look exactly like yours, but it's in the general structure you're looking for:

How to use a script on multiple excel working sheets in r studio

I have a rather long script, at least for me, but i'm new in R.
It looks like this
#DATA IS LOADED
library(xlsx)
df=read.xlsx2("/Users/emiliemariafalkkallenbach/Documents/Norsk_Institut_for_vandforskning/Chase_DK.xlsx",sheetIndex=1)
df[df==""]<- NA
df=data.matrix(df)
df=data.frame(df)
attach(df)
Phyto=df[,1:6]
Submerged=df[,7:12]
Benthic=df[13:18]
# PHTYOPLANKTON
#making of boundaries
a<-(ifelse(P_Resp<=0,(1-P_Acdev),1/(1+P_Acdev)))
b=(0.95+a)/2
c=2*a-b
d=2*c-a
#Weighted boundaries
e=0.95*P_weight
e=sum(e)
f=b*P_weight
f=sum(f)
g=a*P_weight
g=sum(g)
h=c*P_weight
g=sum(g)
i=d*P_weight
i=sum(i)
#EQR weighted
j=P_EQR*P_weight
j=sum(j)
#complete
l <- ifelse(is.na(rowSums(Phyto)),1,0)
m=sum(l)
#SUBMERGED VEGETATION
#Overall assesment
y=ifelse(m<=0,j,"")
z=ifelse(y!="NA",0,(ifelse(y>f,1,(ifelse(y>g,2,(ifelse(y>h,3,(ifelse(y>i,4,5)))))))))
#making of boundaries
a.sub<-(ifelse(S_Resp<=0,(1-S_Acdev),1/(1+S_Acdev)))
a.sub=sum(a.sub)
b.sub=(0.95+a.sub)/2
b.sub=sum(b.sub)
c.sub=2*a.sub-b.sub
c.sub=sum(c.sub)
d.sub=2*c.sub-a.sub
d.sub=sum(d.sub)
#Weighted boundaries
e.sub=0.95*S_weight
e.sub=sum(e)
f.sub=b.sub*S_weight
f.sub=sum(f)
g.sub=a.sub*S_weight
g.sub=sum(g)
h.sub=c.sub*S_weight
h.sub=sum(h)
i.sub=d.sub*S_weight
i.sub=sum(i)
#EQR.sub weighted
j.sub=S_EQR*S_weight
j.sub=sum(j.sub)
#complete.sub
l.sub <- ifelse(is.na(rowSums(Submerged)),1,0)
m.sub=sum(l.sub)
#Overall Assesment
q.sub=m.sub*0.75
y.sub=ifelse(m.sub<=0,j.sub,"")
z.sub=ifelse(y.sub!="NA",(ifelse(y.sub>f.sub,1,(ifelse(y.sub>g.sub,2,(ifelse(y.sub>h.sub,3,(ifelse(y.sub>i.sub,4,5)))))))),0)
BENTHIC INVERTEBRATES
#making of boundaries
a.ben<-(ifelse(B_Resp<=0,(1-B_Acdev),1/(1+B_Acdev)))
b.ben=(0.95+a.ben)/2
c.ben=2*a.ben-b.ben
d.ben=2*c.ben-a.ben
#Weighted boundaries
e.ben=0.95*B_weight
e.ben=sum(e.ben)
f.ben=b.ben*B_weight
f.ben=sum(f.ben)
g.ben=a.ben*B_weight
g.ben=(sum(g.ben))
h.ben=c.ben*B_weight
h.ben=sum(h.ben)
i.ben=d.ben*B_weight
i.ben=sum(i.ben)
#EQR weighted
j.ben=B_EQR*B_weight
#Complete
l.ben <- ifelse(is.na(rowSums(Benthic)),1,0)
m.ben=sum(l.sub)
#ChkAccDev
n.ben=ifelse(B_Resp>0,0.53,1.1)
o.ben=ifelse(B_Acdev<0.15,-1,0)
p.ben=ifelse(B_Acdev>n.ben,1,o.ben)
#Overall Assesment
q.ben=m.ben*0.75
y.ben=ifelse(m.ben<1,j.ben,"")
z.ben=ifelse(y.ben!="NA",ifelse(y.ben>f.ben,1,(ifelse(y.ben>g.ben,2,(ifelse(y.ben>h.ben,3,(ifelse(y.ben>i.ben,4,5))))))),0)
#Final assesment
Z=max(na.omit(c(z,z.sub,z.ben)))
#Overall assesment
SCORE=(ifelse(Z<=1 && Z<2,"HIGH",(ifelse(Z>=2 && Z<3,"GOOD",(ifelse(Z>=3 && Z<4,"MODERATE",(ifelse(Z>=4 && Z<5,"BAD",(ifelse(Z>=5,"POOR","NA"))))))))))
SCORE
#Pie Chart
library('plotrix')
Phyto=33.3
Submerged=33.3
Benthic=33.3
total=100
Slices1=c=(1)
total=100
iniR=0.2
Slices1=c=(1)
pie(1, radius=iniR, init.angle=90, col=c('white'), border = NA, labels='Overall')
colors=c(ifelse(z<=1,"blue", ifelse(z<=2,"green",ifelse(z<=3,"yellow",ifelse(z<=4,"orange",ifelse(z<=5,"red","red"))))), ifelse(z.sub<=1,"blue", ifelse(z.sub<=2,"green", ifelse(z.sub<=3,"yellow",ifelse(z.sub<=4,"orange",ifelse(z.sub<=5,"red","red"))))),ifelse(z.ben<=1,"blue", ifelse(z.ben<=2,"green", ifelse(z.ben<=3, "yellow", ifelse(z.ben<=4,"orange",ifelse(z.ben<=5,"red","red"))))))
floating.pie(0,0,c(Phyto, Submerged, Benthic),radius=5*iniR, startpos=pi/2, col=colors,border=NA)
Slices1=c=(1)
total=100
iniR=0.2 # initial radius
colorst=c(ifelse(Z<=1,"blue", ifelse(Z<=2,"green",ifelse(Z<=3,"yellow",ifelse(Z<=4,"orange",ifelse(Z<=5,"red","red"))))))
floating.pie(0,0,c(total),radius=3*iniR, startpos=pi/2, col=colorst,border=NA)
angles=as.numeric(c(-10,75,80))
pie.labels(0,0,angles,c("Phyto","Submerged","Benthic"),radius=1, bg="white")
pie.labels(0,0,0,c("Overall Assessment"),radius=-0.3)
I guess it does not matter, what is into my script.
At the moment it only runs the first sheet in excel, but I have several and would like to run them on all of them.
The outcome should be a table looking like this
z z.sub z.ben Z Pie-chart (only if possible)
Sheet 1 0 NA NA High
Sheet 2 ... ... ... ...
Sheet 3 ... ... ... ...
I'm sorry if this is an ordinary question!
Hope someone is able to help
Thanks!
a better way to read xls is the library readxl.
# remove "#" if you don't have these libraries installed already
# install.packages("readxl") # faster excel reader
# install.packages("data.table") # faster everything, in this case rbindlist
library(readxl)
library(data.table)
sheets = 1:5 # index numbers or names of the sheets you want to read
readmysheets = function(sheets) {
df = read_excel(file="myexcel.xls", sheets)
}
myfiles = lapply(sheets, readmysheets) # apply the indices/names on the readmysheets function
# you now have a list:
str(myfiles)
# bind the separate sheets together
together = rbindlist(myfiles, fill = T)
Wrap the body of your script into a "for" loop. You have several was of doing this, here are two.
# pre-allocate an object to write to
out <- matrix(rep(NA, numsheets * numcols), ncols = numcols))
for (i in 1:nsheets) {
df=read.xlsx2(".../Chase_DK.xlsx", sheetIndex = i)
#... do calculations
out[, i] <- c(z, z.sub, z.ben, Z)
}
# second way, no need to pre-allocate anything
sapply(1:nsheets, FUN = function(x) {
df=read.xlsx2(".../Chase_DK.xlsx", sheetIndex = x)
#... do calculations
out <- c(z, z.sub, z.ben, Z) # specify what you wish the function to return
return(out) # sapply will try to simplify the combined result on its own
})

Huge data file and running multiple parameters and memory issue, Fisher's test

I have a R code that I am trying to run in a server. But it is stopping in the middle/get frozen probably because of memory limitation. The data files are huge/massive (one has 20 million lines) and if you look at the double for loop in the code, length(ratSplit) = 281 and length(humanSplit) = 36. The data has specific data of human and rats' genes and human has 36 replicates, while rat has 281. So, the loop is basically 281*36 steps. What I want to do is to process data using the function getGeneType and see how different/independent are the expression of different replicate combinations. Using Fisher's test. The data rat_processed_7_25_FDR_05.out looks like this :
2 Sptbn1 114201107 114200202 chr14|Sptbn1:114201107|Sptbn1:114200202|reg|- 2 Thymus_M_GSM1328751 reg
2 Ndufb7 35680273 35683909 chr19|Ndufb7:35680273|Ndufb7:35683909|reg|+ 2 Thymus_M_GSM1328751 rev
2 Ndufb10 13906408 13906289 chr10|Ndufb10:13906408|Ndufb10:13906289|reg|- 2 Thymus_M_GSM1328751 reg
3 Cdc14b 1719665 1719190 chr17|Cdc14b:1719665|Cdc14b:1719190|reg|- 3 Thymus_M_GSM1328751 reg
and the data fetal_output_7_2.out has the form
SPTLC2 78018438 77987924 chr14|SPTLC2:78018438|SPTLC2:77987924|reg|- 11 Fetal_Brain_408_AGTCAA_L006_R1_report.txt reg
EXOSC1 99202993 99201016 chr10|EXOSC1:99202993|EXOSC1:99201016|rev|- 5 Fetal_Brain_408_AGTCAA_L006_R1_report.txt reg
SHMT2 57627893 57628016 chr12|SHMT2:57627893|SHMT2:57628016|reg|+ 8 Fetal_Brain_408_AGTCAA_L006_R1_report.txt reg
ZNF510 99538281 99537128 chr9|ZNF510:99538281|ZNF510:99537128|reg|- 8 Fetal_Brain_408_AGTCAA_L006_R1_report.txt reg
PPFIBP1 27820253 27824363 chr12|PPFIBP1:27820253|PPFIBP1:27824363|reg|+ 10 Fetal_Brain_408_AGTCAA_L006_R1_report.txt reg
Now I have few questions on how to make this more efficient. I think when I run this code, R takes up lots of memory that ultimately causes problems. I am wondering if there is any way of doing this more efficiently
Another possibility is the usage of double for-loop'. Will sapply help? In that case, how should I apply sapply?
At the end I want to convert result into a csv file. I know this is a bit overwhelming to put code like this. But any optimization/efficient coding/programming will be A LOT! I really need to run the whole thing at least one to get the data soon.
#this one compares reg vs rev
date()
ratRawData <- read.table("rat_processed_7_25_FDR_05.out",col.names = c("alignment", "ratGene", "start", "end", "chrom", "align", "ratReplicate", "RNAtype"), fill = TRUE)
humanRawData <- read.table("fetal_output_7_2.out", col.names = c("humanGene", "start", "end", "chrom", "alignment", "humanReplicate", "RNAtype"), fill = TRUE)
geneList <- read.table("geneList.txt", col.names = c("human", "rat"), sep = ',')
#keeping only information about gene, alignment number, replicate and RNAtype, discard other columns
ratRawData <- ratRawData[,c("ratGene", "ratReplicate", "alignment", "RNAtype")]
humanRawData <- humanRawData[, c( "humanGene", "humanReplicate", "alignment", "RNAtype")]
#function to capitalize
capitalize <- function(x){
capital <- toupper(x) ## capitalize
paste0(capital)
}
#capitalizing the rna type naming for rat. So, reg ->REG, dup ->DUP, rev ->REV
#doing this to make data manipulation for making contingency table easier.
levels(ratRawData$RNAtype) <- capitalize(levels(ratRawData$RNAtype))
#spliting data in replicates
ratSplit <- split(ratRawData, ratRawData$ratReplicate)
humanSplit <- split(humanRawData, humanRawData$humanReplicate)
print("done splitting")
#HyRy :when some gene has only reg, rev , REG, REV
#HnRy : when some gene has only reg,REG,REV
#HyRn : add 1 when some gene has only reg,rev,REG
#HnRn : add 1 when some gene has only reg,REG
#function to be used to aggregate
getGeneType <- function(types) {
types <- as.character(types)
if ('rev' %in% types) {
return(ifelse(('REV' %in% types), 'HyRy', 'HyRn'))
}
else {
return(ifelse(('REV' %in% types), 'HnRy', 'HnRn'))
}
}
#logical function to see whether x is integer(0) ..It's used the for loop bellow in case any one HmYn is equal to zero
is.integer0 <- function(x) {
is.integer(x) && length(x) == 0L
}
result <- data.frame(humanReplicate = "human_replicate", ratReplicate = "rat_replicate", pvalue = "p-value", alternative = "alternative_hypothesis",
Conf.int1 = "conf.int1", Conf.int2 ="conf.int2", oddratio = "Odd_Ratio")
for(i in 1:length(ratSplit)) {
for(j in 1:length(humanSplit)) {
ratReplicateName <- names(ratSplit[i])
humanReplicateName <- names(humanSplit[j])
#merging above two based on the one-to-one gene mapping as in geneList defined above.
mergedHumanData <-merge(geneList,humanSplit[[j]], by.x = "human", by.y = "humanGene")
mergedRatData <- merge(geneList, ratSplit[[i]], by.x = "rat", by.y = "ratGene")
mergedHumanData <- mergedHumanData[,c(1,2,4,5)] #rearrange column
mergedRatData <- mergedRatData[,c(2,1,4,5)] #rearrange column
mergedHumanRatData <- rbind(mergedHumanData,mergedRatData) #now the columns are "human", "rat", "alignment", "RNAtype"
agg <- aggregate(RNAtype ~ human+rat, data= mergedHumanRatData, FUN=getGeneType) #agg to make HmYn form
HmRnTable <- table(agg$RNAtype) #table of HmRn ie RNAtype in human and rat.
#now assign these numbers to variables HmYn. Consider cases when some form of HmRy is not present in the table. That's why
#is.integer0 function is used
HyRy <- ifelse(is.integer0(HmRnTable[names(HmRnTable) == "HyRy"]), 0, HmRnTable[names(HmRnTable) == "HyRy"][[1]])
HnRn <- ifelse(is.integer0(HmRnTable[names(HmRnTable) == "HnRn"]), 0, HmRnTable[names(HmRnTable) == "HnRn"][[1]])
HyRn <- ifelse(is.integer0(HmRnTable[names(HmRnTable) == "HyRn"]), 0, HmRnTable[names(HmRnTable) == "HyRn"][[1]])
HnRy <- ifelse(is.integer0(HmRnTable[names(HmRnTable) == "HnRy"]), 0, HmRnTable[names(HmRnTable) == "HnRy"][[1]])
contingencyTable <- matrix(c(HnRn,HnRy,HyRn,HyRy), nrow = 2)
# contingencyTable:
# HnRn --|--HyRn
# |------|-----|
# HnRy --|-- HyRy
#
fisherTest <- fisher.test(contingencyTable)
#make new line out of the result of fisherTest
newLine <- data.frame(t(c(humanReplicate = humanReplicateName, ratReplicate = ratReplicateName, pvalue = fisherTest$p,
alternative = fisherTest$alternative, Conf.int1 = fisherTest$conf.int[1], Conf.int2 =fisherTest$conf.int[2],
oddratio = fisherTest$estimate[[1]])))
result <-rbind(result,newLine) #append newline to result
if(j%%10 = 0) print(c(i,j))
}
}
write.table(result, file = "compareRegAndRev.csv", row.names = FALSE, append = FALSE, col.names = TRUE, sep = ",")
Referring to the accepted answer to Monitor memory usage in R, the amount of memory used by R can be tracked with gc().
If the script is, indeed, running short of memory (which would not surprise me), the easiest way to resolve the problem would be to move the write.table() from the outside to the inside of the loop, to replace the rbind(). It would just be necessary to create a new file name for the CSV file that is written from each output, e.g. by:
csvFileName <- sprintf("compareRegAndRev%03d_%03d.csv",i,j)
If the CSV files are written without headers, they could then be concatenated separately outside R (e.g. using cat in Unix) and the header added later.
While this approach might succeed in creating the CSV file that is sought, it is possible that file might be too big to process subsequently. If so, it may be preferable to process the CSV files individually, rather than concatenating them at all.

Resources