TFBS search with TFBSTools and a for loop

I am currently looking for TFBS motifs in DNA. The original code I wrote to search for 10 different transcription factor motifs worked beautifully. But now I want to search for more than 100 motifs, so I want to use a loop so that I don't have to type in all the motif names manually (and can easily change my list if my interests change). When I do this, however, my code no longer works. I can print the result of the loop and the list appears to contain all the correct information, but when I then use this list to search my DNA, only the last item of the list seems to be used.
Here is my original code that works:
library(TFBSTools)
db <- "jaspar.sqlite"  # assumed: db points at the JASPAR SQLite file used by getMatrixByID() below
initializeJASPARDB(db, version="2014")
MA0018.2 <- getMatrixByID(db, ID="MA0018.2")
MA0024.1 <- getMatrixByID(db, ID="MA0024.1")
MA0062.1 <- getMatrixByID(db, ID="MA0062.1")
MA0099.2 <- getMatrixByID(db, ID="MA0099.2")
MA0112.1 <- getMatrixByID(db, ID="MA0112.1")
MA0153.1 <- getMatrixByID(db, ID="MA0153.1")
MA0156.1 <- getMatrixByID(db, ID="MA0156.1")
MA0157.1 <- getMatrixByID(db, ID="MA0157.1")
MA0158.1 <- getMatrixByID(db, ID="MA0158.1")
MA0259.1 <- getMatrixByID(db, ID="MA0259.1")
pwm1 <- toPWM(MA0018.2)
pwm2 <- toPWM(MA0024.1)
pwm3 <- toPWM(MA0062.1)
pwm4 <- toPWM(MA0099.2)
pwm5 <- toPWM(MA0112.1)
pwm6 <- toPWM(MA0153.1)
pwm7 <- toPWM(MA0156.1)
pwm8 <- toPWM(MA0157.1)
pwm9 <- toPWM(MA0158.1)
pwm10 <- toPWM(MA0259.1)
pwmList <- PWMatrixList(pwm1=pwm1, pwm2=pwm2, pwm3=pwm3, pwm4=pwm4, pwm5=pwm5, pwm6=pwm6, pwm7=pwm7, pwm8=pwm8, pwm9=pwm9, pwm10=pwm10)
seq3 <- readDNAStringSet(file = "sequences/HBV/HBV_genotypeEplusFplusGplusHplusRF.fasta")
sitesetList <- searchSeq(pwmList, seq3, min.score="90%")
as(sitesetList, "data.frame")
as(sitesetList, "DataFrame")
as(sitesetList, "GRanges")
#writeGFF3(sitesetList)
Viruses_TFBS <- writeGFF2(sitesetList)
write.csv(Viruses_TFBS, file = "analysis_HBV/HBV_combmanually_10TFBS.csv")
head(Viruses_TFBS, 5)
Here is my new code with the loop:
TFBS_motifs <- read.csv("TFBS_motifs_test.csv")
seqHSV <- readDNAStringSet(file = "sequences/HSV1.fasta")
# for loop objects
TFBS_matrices <- list()
pwms <- list()
TFBS_ID <- TFBS_motifs$ID
for (i in 1:nrow(TFBS_motifs)) {
  TFBS_matrices[[i]] <- getMatrixByID(db, ID = TFBS_ID[[i]])
  pwms[[i]] <- toPWM(TFBS_matrices[[i]])
}
#print(pwms)
pwmList <- PWMatrixList(pwms[[i]])
sitesetList <- searchSeq(pwmList, seqHSV, min.score="90%")
as(sitesetList, "data.frame")
as(sitesetList, "DataFrame")
as(sitesetList, "GRanges")
#writeGFF3(sitesetList)
HSV_TFBS <- writeGFF2(sitesetList)
write.csv(HSV_TFBS, file = "analysis_HBV/HSV_test.csv")
Thank you all so much for any suggestions...

You don't really need to run this in a loop; most of these commands can deal with lists. Try something like this:
library(TFBSTools)
library(JASPAR2018) # or use 2014 version if you prefer
library(Biostrings)
db <- file.path(system.file("extdata", package="JASPAR2018"), "JASPAR2018.sqlite")
# provide your motifs here or read in from file
mymotifs <- c("MA0018.2", "MA0024.1", "MA0062.1", "MA0099.2")
PFMatrixList <- getMatrixByID(db, ID=mymotifs)
# provide your genome fasta file here
HSV1 <- readDNAStringSet("ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/fkj/FKJZ01.fasta.gz")
sitesetList <- searchSeq(toPWM(PFMatrixList), HSV1, min.score="90%")
# generate GRangesList (and unlist if you prefer a single GRanges object)
unlist(as(sitesetList, "GRangesList"))
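If you would rather keep the explicit loop from the question, note that pwmList <- PWMatrixList(pwms[[i]]) runs after the loop has finished, so i still holds its final value and only the last PWM gets wrapped; that is why only the last motif was searched. A minimal fix, assuming db, TFBS_motifs, TFBS_ID and seqHSV are set up as in the question:
# hand ALL accumulated PWMs to PWMatrixList, not just pwms[[i]]
pwmList <- do.call(PWMatrixList, pwms)
names(pwmList) <- TFBS_ID  # optional: keep the JASPAR IDs as element names
sitesetList <- searchSeq(pwmList, seqHSV, min.score = "90%")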

Related

Error in do.ply(i) : task 1 failed - "could not find function "%>%"" in R parallel programming

Every time I run the script it gives me this error: Error in { : task 1 failed - "could not find function "%>%""
I have already checked every post on this forum and tried to apply the suggestions, but none of them work.
Please advise any solution.
Please note: I have only 2 cores on my PC.
My code is as follows:
library(dplyr) # For basic data manipulation
library(ncdf4) # For creating NetCDF files
library(tidync) # For easily dealing with NetCDF data
library(ggplot2) # For visualising data
library(doParallel) # For parallel processing
MHW_res_grid <- readRDS("C:/Users/SUDHANSHU KUMAR/Desktop/MTech Project/R/MHW_result.Rds")
# Function for creating arrays from data.frames
df_acast <- function(df, lon_lat){
  # Force grid
  res <- df %>%
    right_join(lon_lat, by = c("lon", "lat")) %>%
    arrange(lon, lat)
  # Convert date values to integers if they are present
  if(lubridate::is.Date(res[1,4])) res[,4] <- as.integer(res[,4])
  # Create array
  res_array <- base::array(res[,4], dim = c(length(unique(lon_lat$lon)), length(unique(lon_lat$lat))))
  dimnames(res_array) <- list(lon = unique(lon_lat$lon),
                              lat = unique(lon_lat$lat))
  return(res_array)
}
# Wrapper function for last step before data are entered into NetCDF files
df_proc <- function(df, col_choice){
  # Determine the correct array dimensions
  lon_step <- mean(diff(sort(unique(df$lon))))
  lat_step <- mean(diff(sort(unique(df$lat))))
  lon <- seq(min(df$lon), max(df$lon), by = lon_step)
  lat <- seq(min(df$lat), max(df$lat), by = lat_step)
  # Create full lon/lat grid
  lon_lat <- expand.grid(lon = lon, lat = lat) %>%
    data.frame()
  # Acast only the desired column
  dfa <- plyr::daply(df[c("lon", "lat", "event_no", col_choice)],
                     c("event_no"), df_acast, .parallel = T, lon_lat = lon_lat)
  return(dfa)
}
# We must now run this function on each column of data we want to add to the NetCDF file
doParallel::registerDoParallel(cores = 2)
prep_dur <- df_proc(MHW_res_grid, "duration")
prep_max_int <- df_proc(MHW_res_grid, "intensity_max")
prep_cum_int <- df_proc(MHW_res_grid, "intensity_cumulative")
prep_peak <- df_proc(MHW_res_grid, "date_peak")
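This error usually means the parallel workers never attach dplyr, so %>% is undefined on them: plyr's .parallel mode hands each chunk to foreach, and packages loaded in the main session are not automatically loaded on the workers. A sketch of the usual fix, passing the required package through plyr's .paropts argument (dplyr is the one that must be attached; lubridate is already called with :: above):
# inside df_proc(): tell the foreach workers to attach dplyr
dfa <- plyr::daply(df[c("lon", "lat", "event_no", col_choice)],
                   c("event_no"), df_acast,
                   .parallel = TRUE,
                   .paropts = list(.packages = "dplyr"),
                   lon_lat = lon_lat)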

svm file handling in R

I have a multi-label classification problem. I have a dataset available at the following link:
dataset
This data set is originally from the SIAM competition 2007. It comprises aviation safety reports describing the problem(s) which occurred in certain flights. It is a multi-label, high-dimensional problem with 21519 rows and 30438 columns.
The file is in the sparse ".svm" (libsvm-style) format.
I have read the file with the help of read.delim in R.
After that I got the following output:
head(data[,1])
1 18 2:0.136082763488 6:0.136082763488 7:0.136082763488
12:0.136082763488 20:0.136082763488 23:0.136082763488
32:0.136082763488 37:0.136082763488 39:0.136082763488
... (the same value repeated for further feature indices up to 99)
[2] 1,12,13,18,20 2:0.0916698497028 4:0.0916698497028
6:0.0916698497028 12:0.0916698497028 14:0.0916698497028
16:0.0916698497028 19:0.0916698497028 23:0.0916698497028
... (the same value repeated for further feature indices up to 194)
How can I convert it into a regular dataset?
Any method other than read.delim for reading ".svm" files in R would also be helpful.
My solution uses a number of loops, but it solved my problem.
Below is the R code:
rm(list=ls())
data <- read.delim(file.choose(), header=F)
# Split each line on spaces to create a regular dataset
temp <- list()
for(i in 1:length(data$V1)){
  temp[[i]] <- strsplit(as.character(data$V1[i]), " ")[[1]]
}
# The first field of each line holds the comma-separated response labels
response <- list()
for(i in 1:length(temp)){
  response[[i]] <- as.numeric(strsplit(temp[[i]][1], ",")[[1]])
}
l.response <- 0
for (i in 1:length(response)){
  l.response[i] <- length(response[[i]])
}
# Build the 22 response indicator columns R1..R22
col.names <- paste(rep("R", 22), 1:22, sep="")
l.r <- length(temp)
df.response <- as.data.frame(matrix(0, nrow = l.r, ncol = 22))
names(df.response) <- col.names
for(i in 1:length(response)){
  df.response[i, (response[[i]]+1)] <- 1
}
# Parse the remaining "feature:value" pairs of each line
v.l <- 21519
v.list <- list()
f.vec <- 0
v.vec <- 0
for(i in 1:length(temp)){
  for(j in 2:length(temp[[i]])){
    f.vec[j-1] <- as.numeric(strsplit(temp[[i]][j], ":")[[1]])[1]
    v.vec[j-1] <- as.numeric(strsplit(temp[[i]][j], ":")[[1]])[2]
  }
  v.list[[i]] <- data.frame(f.vec, v.vec)
}
# Pre-allocate the dense feature matrix V1..V30438
feature.name <- paste(rep("V", 30438), 1:30438, sep="")
variables <- data.frame(temp = rep(0, v.l))
for(i in 1:length(feature.name)){
  variables[, feature.name[i]] <- rep(0, v.l)
}
variables <- variables[, -1]
copy.variables <- variables
# Fill in the first 100 rows, dropping duplicated feature indices first
for(i in 1:100){
  pos <- v.list[[i]][, "f.vec"]
  replace <- v.list[[i]][, "v.vec"]
  if(length(unique(pos)) != length(pos)){
    repeat{
      uni <- as.numeric(attr(which(table(pos) > 1), "names"))
      for(k in 1:length(uni)){
        t.pos <- which(pos == uni[k])
        pos <- pos[-t.pos[1]]
        replace <- replace[-t.pos[1]]
      }
      if(length(unique(pos)) == length(pos)) break
    }
  }
  variables[i, pos] <- replace
}
dim(df.response)
dim(variables)
The code below gives the final data for the first 100 rows (100 rows, with all 30438 feature columns plus the 22 response columns):
final.data <- cbind(variables[1:100,], df.response[1:100,])
Other solutions are welcome.
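One alternative worth knowing about: the e1071 package ships a reader for libsvm-style sparse files, read.matrix.csr(), built on SparseM, which avoids the hand-rolled parsing loops. A sketch under one important caveat: standard libsvm readers assume a single label per line, so the multi-label prefixes in this file (e.g. 1,12,13,18,20) would need to be split off or recoded first. The file name below is a placeholder.
# hedged sketch, not a drop-in replacement: assumes one label per line
library(e1071)    # provides read.matrix.csr()
library(SparseM)  # sparse matrix class used by the reader
dat <- read.matrix.csr("aviation.svm")  # hypothetical file name
X <- as.matrix(dat$x)  # dense feature matrix (large: 21519 x 30438)
y <- dat$y             # labels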

Loop to Create and save dataframes into list

I have example code and data below. What I'm trying to figure out is how to write a loop that would create x (in this example x = 3) data frames from a data frame (in this example datadf) and save those data frames in a list. The main part I'm stuck on is how to save each data frame into a list. Any tips are greatly appreciated.
The updated code below almost works, except that the beginning index of the data frames always stays at 1 instead of stepping 10 ahead each time. Does anybody know what the issue is?
Update:
N <- 3
x <- vector("list", N)
for (i in 1:N) {
  a <- (1:100)*rnorm(1,0.5)
  b <- (1:100)*rnorm(1,2)
  datadf <- as.data.frame(cbind(a,b))
  n <- 10
  t <- 50
  datadfn <- datadf[((i-1)*n+1):(t+2*(i-1)*n),]
  x[[i]] <- datadfn
}
Example Code:
n<-10
t<-50
datadf1<-datadf[1:t,]
datadf2<-datadf[(n+1):(t+n),]
datadf3<-datadf[(2*n+1):(t+2*n),]
dfList<-list(datadf1, datadf2, datadf3)
Data:
dput(datadf)
structure(list(a = c(2.00134717160119, 4.00269434320238, 6.00404151480358,
8.00538868640477, 10.006735858006, 12.0080830296072, 14.0094302012083,
16.0107773728095, 18.0121245444107, 20.0134717160119, 22.0148188876131,
24.0161660592143, 26.0175132308155, 28.0188604024167, 30.0202075740179,
32.0215547456191, 34.0229019172203, 36.0242490888215, 38.0255962604226,
40.0269434320238, 42.028290603625, 44.0296377752262, 46.0309849468274,
48.0323321184286, 50.0336792900298, 52.035026461631, 54.0363736332322,
56.0377208048334, 58.0390679764346, 60.0404151480358, 62.041762319637,
64.0431094912381, 66.0444566628393, 68.0458038344405, 70.0471510060417,
72.0484981776429, 74.0498453492441, 76.0511925208453, 78.0525396924465,
80.0538868640477, 82.0552340356489, 84.0565812072501, 86.0579283788513,
88.0592755504524, 90.0606227220536, 92.0619698936548, 94.063317065256,
96.0646642368572, 98.0660114084584, 100.06735858006, 102.068705751661,
104.070052923262, 106.071400094863, 108.072747266464, 110.074094438066,
112.075441609667, 114.076788781268, 116.078135952869, 118.07948312447,
120.080830296072, 122.082177467673, 124.083524639274, 126.084871810875,
128.086218982476, 130.087566154077, 132.088913325679, 134.09026049728,
136.091607668881, 138.092954840482, 140.094302012083, 142.095649183685,
144.096996355286, 146.098343526887, 148.099690698488, 150.101037870089,
152.102385041691, 154.103732213292, 156.105079384893, 158.106426556494,
160.107773728095, 162.109120899697, 164.110468071298, 166.111815242899,
168.1131624145, 170.114509586101, 172.115856757703, 174.117203929304,
176.118551100905, 178.119898272506, 180.121245444107, 182.122592615708,
184.12393978731, 186.125286958911, 188.126634130512, 190.127981302113,
192.129328473714, 194.130675645316, 196.132022816917, 198.133369988518,
200.134717160119), b = c(2.05061146723527, 4.10122293447054,
6.15183440170581, 8.20244586894108, 10.2530573361764, 12.3036688034116,
14.3542802706469, 16.4048917378822, 18.4555032051174, 20.5061146723527,
22.556726139588, 24.6073376068232, 26.6579490740585, 28.7085605412938,
30.7591720085291, 32.8097834757643, 34.8603949429996, 36.9110064102349,
38.9616178774701, 41.0122293447054, 43.0628408119407, 45.113452279176,
47.1640637464112, 49.2146752136465, 51.2652866808818, 53.315898148117,
55.3665096153523, 57.4171210825876, 59.4677325498228, 61.5183440170581,
63.5689554842934, 65.6195669515287, 67.6701784187639, 69.7207898859992,
71.7714013532345, 73.8220128204697, 75.872624287705, 77.9232357549403,
79.9738472221756, 82.0244586894108, 84.0750701566461, 86.1256816238814,
88.1762930911166, 90.2269045583519, 92.2775160255872, 94.3281274928224,
96.3787389600577, 98.429350427293, 100.479961894528, 102.530573361764,
104.581184828999, 106.631796296234, 108.682407763469, 110.733019230705,
112.78363069794, 114.834242165175, 116.88485363241, 118.935465099646,
120.986076566881, 123.036688034116, 125.087299501351, 127.137910968587,
129.188522435822, 131.239133903057, 133.289745370293, 135.340356837528,
137.390968304763, 139.441579771998, 141.492191239234, 143.542802706469,
145.593414173704, 147.644025640939, 149.694637108175, 151.74524857541,
153.795860042645, 155.846471509881, 157.897082977116, 159.947694444351,
161.998305911586, 164.048917378822, 166.099528846057, 168.150140313292,
170.200751780527, 172.251363247763, 174.301974714998, 176.352586182233,
178.403197649469, 180.453809116704, 182.504420583939, 184.555032051174,
186.60564351841, 188.656254985645, 190.70686645288, 192.757477920115,
194.808089387351, 196.858700854586, 198.909312321821, 200.959923789056,
203.010535256292, 205.061146723527)), .Names = c("a", "b"), row.names = c(NA,
-100L), class = "data.frame")
Simply change your second expression, (t+2*(i-1)*n), to (t+(i-1)*n), or, to align with the first expression, ((i-1)*n+t). Also, consider lapply over the for loop, since it returns a list of the same length as the input seq(N), i.e. [1] 1 2 3:
N <- 3
n <- 10
t <- 50
dfList <- lapply(seq(N), function(i) {
  a <- (1:100)*rnorm(1,0.5)
  b <- (1:100)*rnorm(1,2)
  datadf <- as.data.frame(cbind(a,b))
  datadf[((i-1)*n+1):((i-1)*n+t),]
})
Or an easier read:
dfList <- lapply(seq(N), function(i) {
  a <- (1:100)*rnorm(1,0.5)
  b <- (1:100)*rnorm(1,2)
  s <- (i-1)*n
  datadf <- as.data.frame(cbind(a,b))
  datadf[(s+1):(s+t),]
})
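If you want the list elements to carry the names of the data frames they replace (datadf1, datadf2, datadf3), a small follow-up, assuming the dfList built above:
# name the list elements so they can be accessed by name
names(dfList) <- paste0("datadf", seq(N))
dfList$datadf2  # same as dfList[[2]]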

How to use a script on multiple excel working sheets in r studio

I have a rather long script, at least for me, but I'm new to R.
It looks like this:
#DATA IS LOADED
library(xlsx)
df=read.xlsx2("/Users/emiliemariafalkkallenbach/Documents/Norsk_Institut_for_vandforskning/Chase_DK.xlsx",sheetIndex=1)
df[df==""]<- NA
df=data.matrix(df)
df=data.frame(df)
attach(df)
Phyto=df[,1:6]
Submerged=df[,7:12]
Benthic=df[13:18]
# PHTYOPLANKTON
#making of boundaries
a<-(ifelse(P_Resp<=0,(1-P_Acdev),1/(1+P_Acdev)))
b=(0.95+a)/2
c=2*a-b
d=2*c-a
#Weighted boundaries
e=0.95*P_weight
e=sum(e)
f=b*P_weight
f=sum(f)
g=a*P_weight
g=sum(g)
h=c*P_weight
h=sum(h)
i=d*P_weight
i=sum(i)
#EQR weighted
j=P_EQR*P_weight
j=sum(j)
#complete
l <- ifelse(is.na(rowSums(Phyto)),1,0)
m=sum(l)
#SUBMERGED VEGETATION
#Overall assesment
y=ifelse(m<=0,j,"")
z=ifelse(y!="NA",0,(ifelse(y>f,1,(ifelse(y>g,2,(ifelse(y>h,3,(ifelse(y>i,4,5)))))))))
#making of boundaries
a.sub<-(ifelse(S_Resp<=0,(1-S_Acdev),1/(1+S_Acdev)))
a.sub=sum(a.sub)
b.sub=(0.95+a.sub)/2
b.sub=sum(b.sub)
c.sub=2*a.sub-b.sub
c.sub=sum(c.sub)
d.sub=2*c.sub-a.sub
d.sub=sum(d.sub)
#Weighted boundaries
e.sub=0.95*S_weight
e.sub=sum(e.sub)
f.sub=b.sub*S_weight
f.sub=sum(f.sub)
g.sub=a.sub*S_weight
g.sub=sum(g.sub)
h.sub=c.sub*S_weight
h.sub=sum(h.sub)
i.sub=d.sub*S_weight
i.sub=sum(i.sub)
#EQR.sub weighted
j.sub=S_EQR*S_weight
j.sub=sum(j.sub)
#complete.sub
l.sub <- ifelse(is.na(rowSums(Submerged)),1,0)
m.sub=sum(l.sub)
#Overall Assesment
q.sub=m.sub*0.75
y.sub=ifelse(m.sub<=0,j.sub,"")
z.sub=ifelse(y.sub!="NA",(ifelse(y.sub>f.sub,1,(ifelse(y.sub>g.sub,2,(ifelse(y.sub>h.sub,3,(ifelse(y.sub>i.sub,4,5)))))))),0)
# BENTHIC INVERTEBRATES
#making of boundaries
a.ben<-(ifelse(B_Resp<=0,(1-B_Acdev),1/(1+B_Acdev)))
b.ben=(0.95+a.ben)/2
c.ben=2*a.ben-b.ben
d.ben=2*c.ben-a.ben
#Weighted boundaries
e.ben=0.95*B_weight
e.ben=sum(e.ben)
f.ben=b.ben*B_weight
f.ben=sum(f.ben)
g.ben=a.ben*B_weight
g.ben=(sum(g.ben))
h.ben=c.ben*B_weight
h.ben=sum(h.ben)
i.ben=d.ben*B_weight
i.ben=sum(i.ben)
#EQR weighted
j.ben=B_EQR*B_weight
#Complete
l.ben <- ifelse(is.na(rowSums(Benthic)),1,0)
m.ben=sum(l.ben)
#ChkAccDev
n.ben=ifelse(B_Resp>0,0.53,1.1)
o.ben=ifelse(B_Acdev<0.15,-1,0)
p.ben=ifelse(B_Acdev>n.ben,1,o.ben)
#Overall Assesment
q.ben=m.ben*0.75
y.ben=ifelse(m.ben<1,j.ben,"")
z.ben=ifelse(y.ben!="NA",ifelse(y.ben>f.ben,1,(ifelse(y.ben>g.ben,2,(ifelse(y.ben>h.ben,3,(ifelse(y.ben>i.ben,4,5))))))),0)
#Final assesment
Z=max(na.omit(c(z,z.sub,z.ben)))
#Overall assesment
SCORE=(ifelse(Z<=1 && Z<2,"HIGH",(ifelse(Z>=2 && Z<3,"GOOD",(ifelse(Z>=3 && Z<4,"MODERATE",(ifelse(Z>=4 && Z<5,"BAD",(ifelse(Z>=5,"POOR","NA"))))))))))
SCORE
#Pie Chart
library('plotrix')
Phyto=33.3
Submerged=33.3
Benthic=33.3
total=100
Slices1=c=(1)
total=100
iniR=0.2
Slices1=c=(1)
pie(1, radius=iniR, init.angle=90, col=c('white'), border = NA, labels='Overall')
colors=c(ifelse(z<=1,"blue", ifelse(z<=2,"green", ifelse(z<=3,"yellow", ifelse(z<=4,"orange","red")))),
         ifelse(z.sub<=1,"blue", ifelse(z.sub<=2,"green", ifelse(z.sub<=3,"yellow", ifelse(z.sub<=4,"orange","red")))),
         ifelse(z.ben<=1,"blue", ifelse(z.ben<=2,"green", ifelse(z.ben<=3,"yellow", ifelse(z.ben<=4,"orange","red")))))
floating.pie(0,0,c(Phyto, Submerged, Benthic),radius=5*iniR, startpos=pi/2, col=colors,border=NA)
Slices1=c=(1)
total=100
iniR=0.2 # initial radius
colorst=c(ifelse(Z<=1,"blue", ifelse(Z<=2,"green",ifelse(Z<=3,"yellow",ifelse(Z<=4,"orange",ifelse(Z<=5,"red","red"))))))
floating.pie(0,0,c(total),radius=3*iniR, startpos=pi/2, col=colorst,border=NA)
angles=as.numeric(c(-10,75,80))
pie.labels(0,0,angles,c("Phyto","Submerged","Benthic"),radius=1, bg="white")
pie.labels(0,0,0,c("Overall Assessment"),radius=-0.3)
I guess the exact contents of my script don't really matter.
At the moment it only runs on the first sheet of the Excel file, but I have several sheets and would like to run it on all of them.
The outcome should be a table looking like this:
        z    z.sub  z.ben  Z    Pie-chart (only if possible)
Sheet 1 0    NA     NA     High
Sheet 2 ...  ...    ...    ...
Sheet 3 ...  ...    ...    ...
I'm sorry if this is an ordinary question!
Hope someone is able to help
Thanks!
A better way to read xls files is the readxl library.
# remove "#" if you don't have these libraries installed already
# install.packages("readxl") # faster excel reader
# install.packages("data.table") # faster everything, in this case rbindlist
library(readxl)
library(data.table)
sheets = 1:5 # index numbers or names of the sheets you want to read
readmysheets = function(sheets) {
  df = read_excel("myexcel.xls", sheet = sheets)  # read_excel takes the path first and the sheet via "sheet="
}
myfiles = lapply(sheets, readmysheets) # apply the indices/names to the readmysheets function
# you now have a list:
str(myfiles)
# bind the separate sheets together
together = rbindlist(myfiles, fill = T)
Wrap the body of your script in a "for" loop. You have several ways of doing this; here are two.
# pre-allocate an object to write to: one row per statistic, one column per sheet
nsheets <- 3  # set this to your number of sheets
out <- matrix(NA_real_, nrow = 4, ncol = nsheets)
for (i in 1:nsheets) {
  df = read.xlsx2(".../Chase_DK.xlsx", sheetIndex = i)
  #... do calculations
  out[, i] <- c(z, z.sub, z.ben, Z)
}
# second way, no need to pre-allocate anything
sapply(1:nsheets, FUN = function(x) {
  df = read.xlsx2(".../Chase_DK.xlsx", sheetIndex = x)
  #... do calculations
  out <- c(z, z.sub, z.ben, Z) # specify what you wish the function to return
  return(out) # sapply will try to simplify the combined result on its own
})
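To get from the sapply result to the table layout asked for in the question (one row per sheet, columns z, z.sub, z.ben, Z), a short sketch; the transpose is needed because sapply stacks each sheet's result as a column:
res <- t(sapply(1:nsheets, FUN = function(x) {
  df <- read.xlsx2(".../Chase_DK.xlsx", sheetIndex = x)
  # ... do calculations ...
  c(z = z, z.sub = z.sub, z.ben = z.ben, Z = Z)  # named so the columns get labelled
}))
rownames(res) <- paste("Sheet", 1:nsheets)
res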

R loop for creating and using .csv files

I have a function output (from koRpus) of the form:
Total number of tokens: 887
Total number of types: 393
Measure of Textual Lexical Diversity
MTLD: 142.66
Number of factors: 6.22
Factor size: 0.72
SD tokens/factor: 41.55 (all factors)
38 (complete factors only)
And I want to make a loop for storing these results for 80 different documents. I have tried the following:
for (i in 1:length(infra$tableid)) {
  whypar <- paste(infra$whypar[infra[,1] == i], collapse=" ")
  wpi <- removeWords(whypar, stopwords("english"))
  as.data.frame(wpi)
  write.csv(data.frame(wpi), file = "wp.csv")
  tagged.text <- tokenize("wp.csv", lang="en")
  res.mtld <- MTLD(tagged.text)
  write.csv(data.frame(res.mtld), file = "output.csv")
}
where infra is:
tableid: 1, 2, 3, ... 80
whypar: "I took part because xxx", "I believe that jshfdjk", "jhsadkjhd", ... (N = 350)
Thanks for any help
Extract the parts of the MTLD object you are interested in first. From your question it seems like you are only interested in a subset of the object returned by MTLD, namely the MTLD score, the factor size, the SD of tokens/factor, and the SD for complete factors only. If you only want these results for each file, you can write one tidy table as the output for all the files:
res <- data.frame(ID = numeric(), MTLD = numeric(), Factor_Size = numeric(), SD = numeric(), SD_Complete = numeric())
for (i in 1:length(infra$tableid)) {
  whypar <- paste(infra$whypar[infra[,1] == i], collapse=" ")
  wpi <- removeWords(whypar, stopwords("english"))
  wpi <- as.data.frame(wpi)
  write.csv(data.frame(wpi), file = "wp.csv")
  tagged.text <- tokenize("wp.csv", lang="en")
  res.mtld <- MTLD(tagged.text)
  # the results live in S4 slots, so use "@" (not "#") to reach them
  mtld <- res.mtld@MTLD$MTLD
  fac.size <- res.mtld@param$factor.size
  mtld.sd <- res.mtld@MTLD$lengths$sd
  mtld.sd.compl <- res.mtld@MTLD$lengths$sd.compl
  res <- rbind(res, c(infra$tableid[i], mtld, fac.size, mtld.sd, mtld.sd.compl))
}
write.csv(res, file = "output.csv")
write.csv( res , file="output.csv" )
I hope this helps, but check these are the results you want returned.
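If the slot layout differs in your koRpus version, inspect the returned object before extracting; a quick check along these lines (the slot names are assumptions based on the code above):
slotNames(res.mtld)  # list the S4 slots of the object returned by MTLD()
str(res.mtld@MTLD)   # confirm the field names used above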
