svm file handling in R - r

I have a multi-label classification problem. I have a dataset available at the following link:
dataset
This data set is originally from siam competition 2007. The dataset comprises of aviation safety reports describing the problem(s) which occurred in certain flights. It is a multi-classification, high dimensional problem. It has 21519 rows and 30438 columns.
The dataset contains .svm format file.
I have read the file with the help of "read.delim" in R.
After that I got following output:
head(data[,1])
1 18 2:0.136082763488 6:0.136082763488 7:0.136082763488
12:0.136082763488 20:0.136082763488 23:0.136082763488
32:0.136082763488 37:0.136082763488 39:0.136082763488
43:0.136082763488 53:0.136082763488 57:0.136082763488
58:0.136082763488 59:0.136082763488 60:0.136082763488
61:0.136082763488 62:0.136082763488 63:0.136082763488
64:0.136082763488 65:0.136082763488 66:0.136082763488
67:0.136082763488 68:0.136082763488 69:0.136082763488
70:0.136082763488 71:0.136082763488 72:0.136082763488
73:0.136082763488 74:0.136082763488 75:0.136082763488
76:0.136082763488 77:0.136082763488 78:0.136082763488
79:0.136082763488 80:0.136082763488 81:0.136082763488
82:0.136082763488 83:0.136082763488 84:0.136082763488
85:0.136082763488 86:0.136082763488 87:0.136082763488
88:0.136082763488 89:0.136082763488 90:0.136082763488
91:0.136082763488 92:0.136082763488 93:0.136082763488
94:0.136082763488 95:0.136082763488 96:0.136082763488
97:0.136082763488 98:0.136082763488 99:0.136082763488
[2] 1,12,13,18,20 2:0.0916698497028 4:0.0916698497028
6:0.0916698497028 12:0.0916698497028 14:0.0916698497028
16:0.0916698497028 19:0.0916698497028 23:0.0916698497028
26:0.0916698497028 31:0.0916698497028 32:0.0916698497028
33:0.0916698497028 37:0.0916698497028 53:0.0916698497028
57:0.0916698497028 66:0.0916698497028 71:0.0916698497028
72:0.0916698497028 81:0.0916698497028 83:0.0916698497028
84:0.0916698497028 86:0.0916698497028 90:0.0916698497028
92:0.0916698497028 100:0.0916698497028 101:0.0916698497028
102:0.0916698497028 103:0.0916698497028 104:0.0916698497028
105:0.0916698497028 106:0.0916698497028 107:0.0916698497028
108:0.0916698497028 109:0.0916698497028 110:0.0916698497028
111:0.0916698497028 112:0.0916698497028 113:0.0916698497028
114:0.0916698497028 115:0.0916698497028 116:0.0916698497028
117:0.0916698497028 118:0.0916698497028 119:0.0916698497028
120:0.0916698497028 121:0.0916698497028 122:0.0916698497028
123:0.0916698497028 124:0.0916698497028 125:0.0916698497028
126:0.0916698497028 127:0.0916698497028 128:0.0916698497028
129:0.0916698497028 130:0.0916698497028 131:0.0916698497028
132:0.0916698497028 133:0.0916698497028 134:0.0916698497028
135:0.0916698497028 136:0.0916698497028 137:0.0916698497028
138:0.0916698497028 139:0.0916698497028 140:0.0916698497028
141:0.0916698497028 142:0.0916698497028 143:0.0916698497028
144:0.0916698497028 145:0.0916698497028 146:0.0916698497028
147:0.0916698497028 148:0.0916698497028 149:0.0916698497028
150:0.0916698497028 151:0.0916698497028 152:0.0916698497028
153:0.0916698497028 154:0.0916698497028 155:0.0916698497028
156:0.0916698497028 157:0.0916698497028 158:0.0916698497028
159:0.0916698497028 160:0.0916698497028 161:0.0916698497028
162:0.0916698497028 163:0.0916698497028 164:0.0916698497028
165:0.0916698497028 166:0.0916698497028 167:0.0916698497028
168:0.0916698497028 169:0.0916698497028 170:0.0916698497028
171:0.0916698497028 172:0.0916698497028 173:0.0916698497028
174:0.0916698497028 175:0.0916698497028 176:0.0916698497028
177:0.0916698497028 178:0.0916698497028 179:0.0916698497028
180:0.0916698497028 181:0.0916698497028 182:0.0916698497028
183:0.0916698497028 184:0.0916698497028 185:0.0916698497028
186:0.0916698497028 187:0.0916698497028 188:0.0916698497028
189:0.0916698497028 190:0.0916698497028 191:0.0916698497028
192:0.0916698497028 193:0.0916698497028 194:0.0916698497028
How can I convert it into the regular dataset?
Any other method than read.delim for reading ".svm" file in R will also be helpful.

Maybe the solution contains a number of loops. But it solved my problem.
Below is the R-code :
rm(list=ls())
data <- read.delim(file.choose(),header=F)
# Now using strsplit function to create a regular dataser
temp <- list()
for(i in 1:length(data$V1)){
temp[i] <- strsplit(as.character(data$V1[i]),c(" "))
}
response <- list()
for(i in 1:length(temp)){
response[[i]] <- as.numeric(strsplit(temp[[i]][1],",")[[1]])
}
# Now working for responses
l.response <- 0
for (i in 1:length(response)){
l.response[i] <- length(response[[i]])
}
col.names <- paste(rep("R",22),1:22,sep="")
l.r <- length(temp)
df.response <- data.frame(R1=rep(0,l.r),R2=rep(0,l.r),R3=rep(0,l.r),R4=rep(0,l.r),R5=rep(0,l.r)
,R6=rep(0,l.r),R7=rep(0,l.r),R8=rep(0,l.r),R9=rep(0,l.r),R10=rep(0,l.r)
,R11=rep(0,l.r),R12=rep(0,l.r),R13=rep(0,l.r),R14=rep(0,l.r),R15=rep(0,l.r)
,R16=rep(0,l.r),R17=rep(0,l.r),R18=rep(0,l.r),R19=rep(0,l.r),R20=rep(0,l.r)
,R21=rep(0,l.r),R22=rep(0,l.r))
for(i in 1:length(response)){
df.response[i,(response[[i]]+1)] <- 1
}
feature <- c(0)
value <- c(0)
v.l <- 21519
v.list <- list()
list.name <- paste(rep("V",v.l),1:v.l,sep="")
f.vec <- 0
v.vec <- 0
for(i in 1:length(temp)){
for(j in 2:length(temp[[i]])){
f.vec[j-1] <- as.numeric(strsplit(temp[[i]][j],":")[[1]])[1]
v.vec[j-1] <- as.numeric(strsplit(temp[[i]][j],":")[[1]])[2]
}
v.list[[i]] <- data.frame(f.vec,v.vec)
}
feature.name <- paste(rep("V",30438),1:30438,sep="")
v.l <- 21519
variables <- data.frame(temp = rep(0,v.l))
for(i in 1:length(feature.name)){
variables[,feature.name[i]] <- rep(0,v.l)
}
variables <- variables[,-1]
copy.variables <- variables
for(i in 1:100){
pos <- v.list[[i]][,"f.vec"]
replace <- v.list[[i]][,"v.vec"]
if(length(unique(pos))!=length(pos)){
repeat{
uni <- as.numeric(attr(which(table(pos)>1), "names"))
for(k in 1:length(uni)){
t.pos <- which(pos==uni[k])
pos <- pos[-t.pos[1]]
replace <- replace[-t.pos[1]]
}
if(length(unique(pos))==length(pos)) break
}
}
variables[i,pos]<- replace
}
dim(df.response)
dim(variables)
Below code will give the final data with 100 rows and 100 columns.
final.data <- cbind(variables[1:100,],df.response[1:100,])
Welcome for other solutions. #LenGreski

Related

TFBS search with TFBSTools and a for loop

I am currently looking for TFBS motifs in DNA. The original code I wrote to look for 10 different transcription factor motifs worked beautifully. But now I want to search for more than 100 motifs. Therefore I want to use loops so that I dont have to type in all the motif names manually (and can easily change my list if my interests change). But when I do this my code no longer works. I can print the result of the loops and it looks like the list contains all the correct information, but when I then use this list to compare with my DNA the list seems to only remember the last item of the list.
Here is my original code that works:
initializeJASPARDB("jaspar.sqlite", version="2014")
MA0018.2 <- getMatrixByID(db, ID="MA0018.2")
MA0024.1 <- getMatrixByID(db, ID="MA0024.1")
MA0062.1 <- getMatrixByID(db, ID="MA0062.1")
MA0099.2 <- getMatrixByID(db, ID="MA0099.2")
MA0112.1 <- getMatrixByID(db, ID="MA0112.1")
MA0153.1 <- getMatrixByID(db, ID="MA0153.1")
MA0156.1 <- getMatrixByID(db, ID="MA0156.1")
MA0157.1 <- getMatrixByID(db, ID="MA0157.1")
MA0158.1 <- getMatrixByID(db, ID="MA0158.1")
MA0259.1 <- getMatrixByID(db, ID="MA0259.1")
pwm1 <- toPWM(MA0018.2)
pwm2 <- toPWM(MA0024.1)
pwm3 <- toPWM(MA0062.1)
pwm4 <- toPWM(MA0099.2)
pwm5 <- toPWM(MA0112.1)
pwm6 <- toPWM(MA0153.1)
pwm7 <- toPWM(MA0156.1)
pwm8 <- toPWM(MA0157.1)
pwm9 <- toPWM(MA0158.1)
pwm10 <- toPWM(MA0259.1)
pwmList <- PWMatrixList(pwm1=pwm1, pwm2=pwm2, pwm3=pwm3, pwm4=pwm4, pwm5=pwm5, pwm6=pwm6, pwm7=pwm7, pwm8=pwm8, pwm9=pwm9, pwm10=pwm10)
seq3 <- readDNAStringSet(file = "sequences/HBV/HBV_genotypeEplusFplusGplusHplusRF.fasta")
sitesetList <- searchSeq(pwmList, seq3, min.score="90%")
as(sitesetList, "data.frame")
as(sitesetList, "DataFrame")
as(sitesetList, "GRanges")
#writeGFF3(sitesetList)
Viruses_TFBS <- writeGFF2(sitesetList)
write.csv(Viruses_TFBS, file = "analysis_HBV/HBV_combmanually_10TFBS.csv")
head(Viruses_TFBS, 5)
Here is my new code with the loop:
TFBS_motifs <- read.csv("TFBS_motifs_test.csv")
seqHSV <- readDNAStringSet(file = "sequences/HSV1.fasta")
# for loop objects
TFBS_matrices <- list()
pwms <- list()
TFBS_ID <- TFBS_motifs$ID
for (i in 1:nrow(TFBS_motifs)) {
pwms[[i]] <- toPWM(TFBS_matrices[[i]] <- getMatrixByID(db, ID = TFBS_ID[[i]]))
}
#print(pwms)
pwmList <- PWMatrixList(pwms[[i]])
sitesetList <- searchSeq(pwmList, seqHSV, min.score="90%")
as(sitesetList, "data.frame")
as(sitesetList, "DataFrame")
as(sitesetList, "GRanges")
#writeGFF3(sitesetList)
HSV_TFBS <- writeGFF2(sitesetList)
write.csv(HSV_TFBS, file = "analysis_HBV/HSV_test.csv")
Thank you all so much for any suggestions...
You don't really need to run this in a loop; most of these commands can deal with lists. Try something like this:
library(TFBSTools)
library(JASPAR2018) # or use 2014 version if you prefer
library(Biostrings)
db <- file.path(system.file("extdata", package="JASPAR2018"), "JASPAR2018.sqlite")
# provide your motifs here or read in from file
mymotifs <- c("MA0018.2", "MA0024.1", "MA0062.1", "MA0099.2")
PFMatrixList <- getMatrixByID(db, ID=mymotifs)
# provide your genome fasta file here
HSV1 <- readDNAStringSet("ftp://ftp.ebi.ac.uk/pub/databases/ena/wgs/public/fkj/FKJZ01.fasta.gz")
sitesetList <- searchSeq(toPWM(PFMatrixList), HSV1, min.score="90%")
# generate GRangesList (and unlist if you prefer a single GRanges object)
unlist(as(sitesetList, "GRangesList"))

Subsetting a data set and plotting means

I have a data set including Year, Site, and Species Count. I am trying to write a code that reflects in some years, the counts were done twice. For those years I have to find the mean count at each site for each species (there are two different species), and plot those means. This is the code I have generated:
DataSet1 <- subset(channel_islands,
channel_islands$SpeciesName=="Hypsypops ubicundus, adult" |
channel_islands$SpeciesName=="Paralabrax clathratus,adult")
years<-unique(DataSet1$Year)
Hypsypops_mean <- NULL
Paralabrax_mean <- NULL
Mean <- NULL
years <- unique(DataSet1$Year)
for(i in 1:length(years)){
data_year <- DataSet1[which(DataSet1$Year == years[i]), ]
Hypsypops<-data_year[which(data_year$SpeciesName=="Hypsypops rubicundus,adult"), ]
Paralabrax<-data_year[which(data_year$SpeciesName=="Paralabrax clathratus,adult"), ]
UNIQUESITE<-unique(unique(data_year$Site))
for(m in 1:(length(UNIQUESITE))){
zz<-Hypsypops[Hypsypops$Site==m,]
if(length(zz$Site)>=2){
Meanp <- mean(Hypsypops$Count[Hypsypops$Site==UNIQUESITE[m]])
Hypsypops_mean <- rbind(Hypsypops_mean,
c(UNIQUESITE[m], years[i], round(Meanp,2),
'Hypsypops rubicundus,adult'))
}
kk <- Paralabrax[Paralabrax$Site==m, ]
if(length(kk$Site)>=2){
Meane <- mean(Paralabrax$Count[Paralabrax$Site==UNIQUESITE[m]])
Paralabrax_mean <- rbind(Paralabrax_mean,
c(UNIQUESITE[m], years[i], round(Meane, 2),
'Paralabrax clathratus,adult'))
}
}
if(i==1){
Mean<-rbind(Hypsypops_mean, Paralabrax_mean)
}
if(i>1){
Mean<-rbind(DataMean, Hypsypops_mean, Paralabrax_mean)
}
Hypsypops_mean<-NULL
Paralabrax_mean<-NULL
}
Mean <- as.data.frame(Mean,stringsAsFactors=F)
names(Mean) <- c('Site','Year','mean_count','SpeciesName')
Mean$Site <- as.integer(Mean$Site)
Mean$Year <- as.integer(Mean$Year)
Mean$mean_count <- as.numeric(Mean$mean_count)
par(mfrow=c(5,5), oma=c(4,2,4,2), mar=c(5.5,4,3,0))
for(i in 1:length(years)){
if(any(Mean$Year==years[i])) {
year1<-Mean[which(Mean$Year==years[i]),]
Species<-unique(as.character(year1$SpeciesName))
Colors<-c("pink","purple")[Species]
Data_Hr<-year1[year1$SpeciesName=="Hypsypops rubicundus,adult",]
Data_Pc<-year1[year1$SpeciesName=="Paralabrax clathratus,adult",]
plot(Data_Hr$mean_count~Data_Pc$mean_count,
xlab=c("Hypsypops rubicundus"),
ylab=c("Paralabrax clathratus"),main=years[i],pch=16)
}
}
It's a lot I'm sorry, I'm not sure of a way to streamline the process. But I keep getting an error:
Error in names(Mean) <- c("Site", "Year", "mean_count", "SpeciesName")
: 'names' attribute [4] must be the same length as the vector [0]
Not sure how I can debug this.
Not sure why you want to do this with an elaborate loop code. It sounds like you are trying to summarise your data.
This can be done in different ways. Here is a solution using dplyr:
DataSet1 %>%
group_by(Year, SpeciesName, Site) %>%
summarise(nrecords = n(),
Count = mean(Count))
To get a better answer, it might be helpful to post a subset of the data and the intended result you are after.

how to extract topic models result with its weights in R mallet

Anyone please help me for extracting topic models results (topic terms) along with its weights.
this is my code
mallet.instances <- mallet.import(as.character(documents$id), as.character(documents$text), "custom_stopwords.csv", FALSE, token.regexp="\\p{L}[\\p{L}\\p{P}]+\\p{L}")
n.topics <- 30
topic.model <- MalletLDA(n.topics)
topic.model$loadDocuments(mallet.instances)
vocabulary <- topic.model$getVocabulary()
word.freqs <- mallet.word.freqs(topic.model)
topic.model$setAlphaOptimization(20, 50)
topic.model$train(200)
topic.model$maximize(10)
doc.topics <- mallet.doc.topics(topic.model, smoothed=T, normalized=T)
topic.words <- mallet.topic.words(topic.model, smoothed=T, normalized=T)
topic.docs <- t(doc.topics)
topic.docs <- topic.docs / rowSums(topic.docs)
write.csv(topic.docs, "topics-docs.csv" )
topics.labels <- rep("", n.topics)
for (topic in 1:n.topics) topics.labels[topic] <-paste(mallet.top.words(topic.model, topic.words[topic,], num.top.words=10)$words, collapse=",")
topics.labels
write.csv(topics.labels, "topics-labels.csv")

How to add data in object with index in R?

first at all: I am completly new to R. So please excuse me if the question is a little crude.
I am trying multivariate clustering of functional data. Therefore I used Ramsay & Silvermans fda-Package to build basis spline expansion systems, fill them with curves and applyed the funclust to the dataset.
The funclust-Function gives a proposal as vector for the clustering named as clsResult e.g.
clsResult <- c(2,2,2,3,3,2,3,2,2)
At the next step I would like to calculate statistical measures like mean, standard deviation,... That is why I wish to separate the data for each class and calculate the statistics.
An example for the mean calculation:
uniGroups <- sort(unique(as.vector(clsResult)))
j = 1
for (i in uniGroups) {
obsItems <- which(cls %in% i)
fdClsMean[j] <- mean.fd(fdData[obsItems])
plot(fdClsMean[j])
j <- j + 1
}
The variable $fdClsMean$ should now contain the the mean-Curves for classes 2 (j=1) and 3 (j=2).
But by doing this way, I get the following error message:
Error in basisobj$type : $ operator is invalid for atomic vectors
In addition: Warning messages:
1: In fdClsMean[j] <- mean.fd(fdData[obsItems]) :
number of items to replace is not a multiple of replacement length
2: In fdClsMean[j] <- mean.fd(fdData[obsItems]) :
number of items to replace is not a multiple of replacement length
If you have some idea to fix my problem, it would deeply grateful to share this to fix my issues...
library("fda", lib.loc="~/R/win-library/3.3")
library("Funclustering", lib.loc="~/R/win-library/3.3")
library("RColorBrewer")
dataParam1 <- structure(c(0.983981396374184, 0.985667565176901, 0.987353733979619,
0.989039902782336, 0.990726071585054, 0.992412240387771, 0.994098409190489,
0.995784577993206, 0.997470746795924, 0.999156915598641, 1.00084308440136,
1.00252925320408, 1.00421542200679, 1.00590159080951, 1.00758775961223,
1.00927392841495, 1.01096009721766, 1.01264626602038, 1.0143324348231,
1.01601860362582), .Dim = c(20L, 1L))
dataParam2 <- structure(c(0.935807922166589, 0.943068751205336, 0.950253873594361,
0.957301033607757, 0.964196288650217, 0.970959127061196, 0.977617918964004,
0.984189979476357, 0.990668963023258, 0.997013182869952, 1.00323559960119,
1.00938298993635, 1.01547786768659, 1.02152013701955, 1.0274999384715,
1.0334274313317, 1.03932634191985, 1.04522750712415, 1.05115275864212,
1.05710855288827, 0.944940959736965, 0.952240113360859, 0.959441049641086,
0.966488172817292, 0.97341343344192, 0.980222400271887, 0.986902496653507,
0.993448451457477, 0.99986985906694, 1.00619310390116, 1.01243170737101,
1.01858974507763, 1.02466601555031, 1.03064904117653, 1.03651652181871,
1.04225550204583, 1.04786681255322, 1.05337761542761, 1.05881863137361,
1.06421715026136, 0.942134107403247, 0.949453882063492, 0.956590162743654,
0.96357651793391, 0.970434299161641, 0.977175121667198, 0.983804637900584,
0.990326873791272, 0.996747607095444, 1.00308113105998, 1.00933426721898,
1.01549301552775, 1.02154277232314, 1.02747204422399, 1.0332709082207,
1.03893203367462, 1.04446158372569, 1.04987525096791, 1.05520008265379,
1.06046511659515, 0.940314383500459, 0.947443682667925, 0.954466467035383,
0.961381593377821, 0.968188153734298, 0.974880350284026, 0.981455066595135,
0.987919638428396, 0.994283132996982, 1.00055457342423, 1.00673992698461,
1.01284173849434, 1.01885881474915, 1.02478559454537, 1.03060319500227,
1.03629277173835, 1.04185574786414, 1.04730757048597, 1.05268469714051,
1.05802280631965, 0.942200273210682, 0.949537004317527, 0.956623599167406,
0.963557498320244, 0.970424531942181, 0.977192573805136, 0.983814211488919,
0.990277133991018, 0.996653057498476, 1.00300075654431, 1.00933625505411,
1.01557529576456, 1.02166231349206, 1.02762357846321, 1.03350687583051,
1.0393552762534, 1.04517784054144, 1.05097645707751, 1.05675187074763,
1.06250633686837, 0.918631352137535, 0.926168759248379, 0.933383610717465,
0.940494686251366, 0.947502148141247, 0.954389475599439, 0.961178237128648,
0.967896560645279, 0.974461153761216, 0.980830749331793, 0.987116074571801,
0.99342184407008, 0.99972724615763, 1.00590479212328, 1.01188232533948,
1.01767723457145, 1.02335415372681, 1.02896316014817, 1.03452862636983,
1.0400636313845, 0.948799842055992, 0.95604267242623, 0.963156714842769,
0.970143065897736, 0.97700934008937, 0.983765186828579, 0.990417394142179,
0.996974994464174, 1.00344983548711, 1.00984462195143, 1.01615468562631,
1.02236814859822, 1.02847302994779, 1.0344602037918, 1.04032333101854,
1.04606311135653, 1.0517058460477, 1.05728582670374, 1.06283690174272,
1.06837903584947, 0.937423306587466, 0.944893372711674, 0.952215313085228,
0.959400162368507, 0.966459044310274, 0.973365235191435, 0.980115166296175,
0.986739018932661, 0.993253673429198, 0.999669550405327, 1.0059957825912,
1.012243755661, 1.01842325413489, 1.02453580456187, 1.03057996323426,
1.03655461352516, 1.04246624447002, 1.04832385360279, 1.05413680517058,
1.05991330934091, 0.929064626164736, 0.936318053087355, 0.943487918232746,
0.950544729360738, 0.957477543275641, 0.964285599365032, 0.970974339717291,
0.977549594039067, 0.984023864258173, 0.990424135490678, 0.996768075950284,
1.00305982852269, 1.00929091001667, 1.01544534554472, 1.02150212798765,
1.02744072592741, 1.03324516748151, 1.03892952896684, 1.04453104163349,
1.05008784822354, 0.93174505802589, 0.93913180070802, 0.946369086629952,
0.953430967475995, 0.960342639111208, 0.967156853413704, 0.973912193750937,
0.980515241879027, 0.986922063141707, 0.993226171372671, 0.999502978127663,
1.00578905610495, 1.01204829399176, 1.01823525590035, 1.02431611212046,
1.03025344575662, 1.03602479561377, 1.04166823872877, 1.04723886048702,
1.05279403456375), .Dim = c(20L, 10L))
dataParam3 <- structure(c(1.09752068287775, 1.06097903366602, 1.02849490570095,
1.00006829898254, 0.975699213510778, 0.955387649285669, 0.939133606307212,
0.926937084575411, 0.918798084090263, 0.91471660485177, 0.914692646859928,
0.918726210114739, 0.926817294616202, 0.938965900364315, 0.955172027359076,
0.975435675600487, 0.999756845088542, 1.02813553582325, 1.06057174780459,
1.09706548103259, 1.15326107555771, 1.09807034962895, 1.05134196032301,
1.0130026453226, 0.98288375314881, 0.960918342480959, 0.947105518522033,
0.941413952185788, 0.942411639052145, 0.946950432124294, 0.95383973098054,
0.963066506646878, 0.974630759123311, 0.98853248840984, 1.00477169450647,
1.02334837741319, 1.04426253713001, 1.06751417365691, 1.09310328699392,
1.12102987714102, 1.08280188279282, 1.0339980008814, 0.99569456675958,
0.96693069628092, 0.945845840346565, 0.930481541243696, 0.919050991554413,
0.911059810335648, 0.906507018011789, 0.905392614582835, 0.907716600048786,
0.913478974409641, 0.9226797376654, 0.935318889816062, 0.951396430861627,
0.970912360802095, 0.993866679637463, 1.02025938736774, 1.05009048399292,
1.083359969513, 1.07095942421253, 1.03585502956334, 1.00549026148582,
0.979865119979976, 0.958979605045805, 0.942833716683301, 0.931427454892467,
0.924759134770715, 0.922715496413862, 0.925000988457256, 0.931232021681892,
0.940974907301548, 0.954057033380912, 0.970475602146626, 0.99023061359869,
1.0133220677371, 1.03974996456186, 1.06951430407298, 1.10261508627044,
1.13905231115425, 1.11173319871675, 1.07474700125511, 1.04289364687089,
1.015144362519, 0.991499148199451, 0.971958003912224, 0.956520929657326,
0.94518792543476, 0.937950419475085, 0.934666308796689, 0.935126761169374,
0.939286949471352, 0.947146873129281, 0.95870653214316, 0.97396592651299,
0.992925056238774, 1.01558392132051, 1.0419425217582, 1.07200085755185,
1.10575892870146, 1.18452158624921, 1.12370399561361, 1.07335363604345,
1.03347050753872, 1.00405461009944, 0.985101985787768, 0.975484170734052,
0.971553828495674, 0.970543650731045, 0.971115755631005, 0.973165208428402,
0.97669200912324, 0.981696157715515, 0.988177654205224, 0.996136498592366,
1.00557269087694, 1.01648623105895, 1.02887711913838, 1.04274535511525,
1.05809093898954, 1.06649113627055, 1.024759151488, 0.991993644427598,
0.966493902211868, 0.946597045425109, 0.931024085529654, 0.919567682099019,
0.912227835133204, 0.90900454463221, 0.909897810596039, 0.914907633024689,
0.92403008262567, 0.936993197222042, 0.953120075509914, 0.972106114234723,
0.993944684266615, 1.01863578560559, 1.04617941825165, 1.07657558220479,
1.10982427746502, 1.1253871751946, 1.08517168478315, 1.05718721613491,
1.03386322378679, 1.01386163522085, 0.996961380709181, 0.983162460251796,
0.972464873848689, 0.96486862149986, 0.960373703205309, 0.958980118965034,
0.960687868779034, 0.965496952647307, 0.973407370569853, 0.984419122546672,
0.998532208577764, 1.01574662866313, 1.03606238280277, 1.05947947099668,
1.08599789324487, 1.10370505705101, 1.07105955490089, 1.042537295611,
1.01813827918133, 0.997862505611908, 0.981709974902708, 0.969649294946133,
0.961487003409706, 0.957046106397668, 0.956305216070678, 0.959264332428738,
0.965923455471853, 0.976282585200022, 0.990341721613246, 1.00810086471153,
1.02956001449487, 1.05471917096326, 1.08357833411671, 1.11613750395523,
1.15239574124661, 1.17014704608091, 1.12539391342114, 1.08622797779053,
1.05264923918907, 1.02465769761678, 1.00225335307363, 0.985253167489029,
0.972675875945994, 0.963752595681981, 0.958418763926775, 0.956674380680374,
0.95851944594278, 0.963953959713992, 0.972977921994012, 0.985591332782836,
1.00179419208046, 1.02158649988689, 1.04496825620213, 1.07193946102617,
1.10250011435902), .Dim = c(20L, 10L))
dataParam4 <- structure(c(0.998027622801379, 0.998139005663452, 0.998271051914594,
0.998423328203811, 0.998595824570253, 0.998787296486765, 0.998993932660538,
0.999210129159112, 0.999431146138834, 0.999656257754324, 0.999883808694126,
1.00011168941983, 1.00033957014554, 1.00056745087124, 1.00079533159695,
1.00102321232266, 1.00125109304836, 1.00147897377407, 1.00170685449977,
1.00193473522548, 0.998889268641521, 0.998995305981988, 0.999133764074945,
0.99928651296306, 0.999452207584217, 0.999630847938417, 0.99982243402566,
1.00002692932507, 1.00024266452332, 1.00046582375815, 1.00069345965806,
1.00092367291089, 1.00115613664618, 1.00139085086393, 1.00162781556414,
1.0018670307468, 1.00210849641193, 1.0023521950849, 1.00259752350583,
1.0028432697472, 0.99858396998391, 0.99870387003, 0.998847936377362,
0.999013112737924, 0.999193672030613, 0.999385662628316, 0.999586590108664,
0.999795769265012, 1.00001319873968, 1.00023887520213, 1.00047211431565,
1.00071073589431, 1.00095287258822, 1.00119740014348, 1.00144380788341,
1.00169207895861, 1.00194221336906, 1.00219421111478, 1.00244807219575,
1.00270379661199, 0.998270181390647, 0.998401339075749, 0.998551858486823,
0.998714133501672, 0.998888127245082, 0.999073839717051, 0.999271270917581,
0.999480396599729, 0.999699586873158, 0.999924714214518, 1.00015335625382,
1.00038507691478, 1.00061973867722, 1.00085733931215, 1.00109787881957,
1.00134135719949, 1.00158777258983, 1.00183667764293, 1.00208661796178,
1.00233662699637, 0.998524744128797, 0.99863486766656, 0.998766395000859,
0.998919231110539, 0.999090190884781, 0.99927303979413, 0.999465840024052,
0.999667849206214, 0.99987677069341, 1.00009092850323, 1.00030946243406,
1.00053217357039, 1.00075802790152, 1.00098471847949, 1.00121141596579,
1.00143811345209, 1.0016648109384, 1.0018915084247, 1.002118205911,
1.00234490339731, 0.997142773287418, 0.997216060623538, 0.997337368533094,
0.99748694099883, 0.997653350796102, 0.997832448382706, 0.998024129720826,
0.998228394810461, 0.998445199575133, 0.9986718900988, 0.998902679532502,
0.999136069032265, 0.999372057841899, 0.999610645961405, 0.999851833390782,
1.00009562013003, 1.00034200617915, 1.00059099153814, 1.00084257620701,
1.00109676018574, 0.998774949267013, 0.998910297833251, 0.999061521216158,
0.999227984680012, 0.999409687603104, 0.999605338922452, 0.99980970198149,
1.00001925408346, 1.00023376857386, 1.00045324545269, 1.00067768471994,
1.00090707945052, 1.00114095033083, 1.00137806194375, 1.00161725633342,
1.00185762361078, 1.00209895072581, 1.00234123762244, 1.00258448430066,
1.00282869076048, 0.998235072519545, 0.998344557202282, 0.998473860627503,
0.998622982795209, 0.9987919237054, 0.998980556065077, 0.999185038787191,
0.999398230779131, 0.999618154290144, 0.999844806347269, 1.00007793486253,
1.00031476590786, 1.00055208167417, 1.00078941271967, 1.00102675904437,
1.00126412064827, 1.00150149743951, 1.00173888541517, 1.00197627612265,
1.00221366684226, 0.997626171909553, 0.997748769141173, 0.997905556676881,
0.998072158983953, 0.998246774313143, 0.998429402664452, 0.998620044037878,
0.998818696371925, 0.999024892332026, 0.999237142122056, 0.999454583203959,
0.999677175584942, 0.999904919265005, 1.00013781424415, 1.00037584615344,
1.00061802820279, 1.00086189544351, 1.00110629255917, 1.00135119016537,
1.00159658826211, 0.99786182119536, 0.997943059648897, 0.998068582740041,
0.998214894107906, 0.998377432849961, 0.998555103351116, 0.998747881433229,
0.998955724328998, 0.999175663350038, 0.999400370426972, 0.999626749674934,
0.999854748906657, 1.00008436812214, 1.00031560732139, 1.0005484665044,
1.00078294567116, 1.00101904482169, 1.00125676395599, 1.00149610307404,
1.00173700388097), .Dim = c(20L, 10L))
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Param1
# ####################################################################################
xVal <- as.vector(dataParam1)
nObs <- dim(dataParam3)[2]
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Param2
# ####################################################################################
# Build the basis expansion system for Param2
fdBasisParam2 <- create.bspline.basis(rangeval = range(xVal), norder=6)
# Calculate the coefficients for Param2 as matrix at once
yVal <- as.matrix(dataParam2)
fdParam2 <- Data2fd(argvals=xVal,y=yVal, basisobj=fdBasisParam2, lambda=0)
round(fdParam2$coefs, 4)
rm(yVal)
plot(fdParam2)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Param3
# ####################################################################################
# Build the basis expansion system for Param3
fdBasisParam3 <- create.bspline.basis(rangeval = range(xVal), norder=3)
# Calculate the coefficients for Param3 as matrix at once
yVal <- as.matrix(dataParam3)
fdParam3 <- Data2fd(argvals=xVal,y=yVal, basisobj=fdBasisParam3, lambda=0)
round(fdParam3$coefs, 4)
rm(yVal)
plot(fdParam3)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Param4
# ####################################################################################
# Build the basis expansion system for Param4
fdBasisParam4 <- create.bspline.basis(rangeval = range(xVal), norder=3)
# Calculate the coefficients for Param4 as matrix at once
yVal <- as.matrix(dataParam4)
fdParam4 <- Data2fd(argvals=xVal,y=yVal, basisobj=fdBasisParam4, lambda=0)
round(fdParam4$coefs, 4)
rm(yVal)
plot(fdParam4)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Clustering multivariate functional Data with funclust algorithm
# ####################################################################################
# -- Create a multivariate functional data object
allFd <- list(fdParam2,fdParam3,fdParam4)
# -- clustering in K classes
K <- 3 # Number of clusters
thd <- 0.05 # Threshold for Cantell Scree-Test
hard <- FALSE
nLoop <- 1
cls <- c()
tik <- c()
for (i in 1:nLoop) {
clustResult <- funclust(allFd,
K=K,
thd=thd,
increaseDimension=FALSE,
hard=hard
)
tik <- rbind(round((clustResult$tik)*100, 2))
cls <- rbind(cls,clustResult$cls)
}
#
# Calculation of class-specific characteristics
#
uniGroups <- sort(unique(as.vector(cls)))
fbParam2ClsMean <- list()
fbParam3ClsMean <- list()
fbParam4ClsMean <- list()
j <- 1
for (i in uniGroups) {
obsItems <- which(cls %in% i)
# Mean values
fbParam2ClsMean[[j]] <- mean.fd(fdParam2[obsItems])
fbParam3ClsMean[[j]] <- mean.fd(fdParam3[obsItems])
fbParam4ClsMean[[j]] <- mean.fd(fdParam4[obsItems])
j <- j+1
}
plot(fbParam2ClsMean)
plot(fbParam3ClsMean)
plot(fbParam4ClsMean)
The additional errors are caused by the output of mean.fd(fdData[obsItems]), not a single element. you need list() to put it. (listname[[1]] means 1st object in the list.)
fdClsMean <- list()
for (i in uniGroups) {
obsItems <- which(cls %in% i) # what is cls ??
fdClsMean[[j]] <- mean.fd(fdData[obsItems])
plot(fdClsMean[[j]])
j <- j + 1
}
[Edited]
Your fbParamXClsMean is a class list and have three fd objects. you need to pick up one fd object when you draw it.
for example;
plot(fbParam2ClsMean[[1]])
plot(fbParam2ClsMean[[2]], add=T, col=2, lty=2)
plot(fbParam2ClsMean[[3]], add=T, col=3, lty=3)
# if use for()
a <- FALSE
for(i in 1:3) {
plot(fbParam2ClsMean[[i]], col = i, lty = i, add = a)
a <- TRUE
}
Thank you very much for your help. Sure your solution works definitely but I would like to use the capabilities of the fda-package - it provides several calculation and plotting methods for handling of these objects.
Therefore I'd like to propose my solution:
Create an fd-object based on an existing b-spline expansion system (e.g. called fdParam2Mean). While the matrix fdParam2ClsMean$coefs is filled with zeros, I delete these manually.
fdParam2ClsMean <- fd(coef=NULL, basisobj=fdParam2$basis)
fdParam2ClsMean$coefs <- fdParam2ClsMean$coefs[,-1]
Request the class names and store them in uniGroups
uniGroups <- sort(unique(as.vector(cls)))
Iterate through the uniGroups for calculation of statistics (like mean, standard deviation etc.)
for (i in uniGroups) {
obsItems <- which(cls %in% i)
# Mean values
fdParam2ClsMean$coefs <- cbind(fdParam2ClsMean$coefs, as.array(mean.fd(fdParam2[obsItems])$coefs))
}
After these operations you are able to process this objects with mathematical operations or for example use the plot() command.
plot(fdParam2ClsMean)

How to use a script on multiple excel working sheets in r studio

I have a rather long script, at least for me, but i'm new in R.
It looks like this
#DATA IS LOADED
library(xlsx)
df=read.xlsx2("/Users/emiliemariafalkkallenbach/Documents/Norsk_Institut_for_vandforskning/Chase_DK.xlsx",sheetIndex=1)
df[df==""]<- NA
df=data.matrix(df)
df=data.frame(df)
attach(df)
Phyto=df[,1:6]
Submerged=df[,7:12]
Benthic=df[13:18]
# PHTYOPLANKTON
#making of boundaries
a<-(ifelse(P_Resp<=0,(1-P_Acdev),1/(1+P_Acdev)))
b=(0.95+a)/2
c=2*a-b
d=2*c-a
#Weighted boundaries
e=0.95*P_weight
e=sum(e)
f=b*P_weight
f=sum(f)
g=a*P_weight
g=sum(g)
h=c*P_weight
g=sum(g)
i=d*P_weight
i=sum(i)
#EQR weighted
j=P_EQR*P_weight
j=sum(j)
#complete
l <- ifelse(is.na(rowSums(Phyto)),1,0)
m=sum(l)
#SUBMERGED VEGETATION
#Overall assesment
y=ifelse(m<=0,j,"")
z=ifelse(y!="NA",0,(ifelse(y>f,1,(ifelse(y>g,2,(ifelse(y>h,3,(ifelse(y>i,4,5)))))))))
#making of boundaries
a.sub<-(ifelse(S_Resp<=0,(1-S_Acdev),1/(1+S_Acdev)))
a.sub=sum(a.sub)
b.sub=(0.95+a.sub)/2
b.sub=sum(b.sub)
c.sub=2*a.sub-b.sub
c.sub=sum(c.sub)
d.sub=2*c.sub-a.sub
d.sub=sum(d.sub)
#Weighted boundaries
e.sub=0.95*S_weight
e.sub=sum(e)
f.sub=b.sub*S_weight
f.sub=sum(f)
g.sub=a.sub*S_weight
g.sub=sum(g)
h.sub=c.sub*S_weight
h.sub=sum(h)
i.sub=d.sub*S_weight
i.sub=sum(i)
#EQR.sub weighted
j.sub=S_EQR*S_weight
j.sub=sum(j.sub)
#complete.sub
l.sub <- ifelse(is.na(rowSums(Submerged)),1,0)
m.sub=sum(l.sub)
#Overall Assesment
q.sub=m.sub*0.75
y.sub=ifelse(m.sub<=0,j.sub,"")
z.sub=ifelse(y.sub!="NA",(ifelse(y.sub>f.sub,1,(ifelse(y.sub>g.sub,2,(ifelse(y.sub>h.sub,3,(ifelse(y.sub>i.sub,4,5)))))))),0)
BENTHIC INVERTEBRATES
#making of boundaries
a.ben<-(ifelse(B_Resp<=0,(1-B_Acdev),1/(1+B_Acdev)))
b.ben=(0.95+a.ben)/2
c.ben=2*a.ben-b.ben
d.ben=2*c.ben-a.ben
#Weighted boundaries
e.ben=0.95*B_weight
e.ben=sum(e.ben)
f.ben=b.ben*B_weight
f.ben=sum(f.ben)
g.ben=a.ben*B_weight
g.ben=(sum(g.ben))
h.ben=c.ben*B_weight
h.ben=sum(h.ben)
i.ben=d.ben*B_weight
i.ben=sum(i.ben)
#EQR weighted
j.ben=B_EQR*B_weight
#Complete
l.ben <- ifelse(is.na(rowSums(Benthic)),1,0)
m.ben=sum(l.sub)
#ChkAccDev
n.ben=ifelse(B_Resp>0,0.53,1.1)
o.ben=ifelse(B_Acdev<0.15,-1,0)
p.ben=ifelse(B_Acdev>n.ben,1,o.ben)
#Overall Assesment
q.ben=m.ben*0.75
y.ben=ifelse(m.ben<1,j.ben,"")
z.ben=ifelse(y.ben!="NA",ifelse(y.ben>f.ben,1,(ifelse(y.ben>g.ben,2,(ifelse(y.ben>h.ben,3,(ifelse(y.ben>i.ben,4,5))))))),0)
#Final assesment
Z=max(na.omit(c(z,z.sub,z.ben)))
#Overall assesment
SCORE=(ifelse(Z<=1 && Z<2,"HIGH",(ifelse(Z>=2 && Z<3,"GOOD",(ifelse(Z>=3 && Z<4,"MODERATE",(ifelse(Z>=4 && Z<5,"BAD",(ifelse(Z>=5,"POOR","NA"))))))))))
SCORE
#Pie Chart
library('plotrix')
Phyto=33.3
Submerged=33.3
Benthic=33.3
total=100
Slices1=c=(1)
total=100
iniR=0.2
Slices1=c=(1)
pie(1, radius=iniR, init.angle=90, col=c('white'), border = NA, labels='Overall')
colors=c(ifelse(z<=1,"blue", ifelse(z<=2,"green",ifelse(z<=3,"yellow",ifelse(z<=4,"orange",ifelse(z<=5,"red","red"))))), ifelse(z.sub<=1,"blue", ifelse(z.sub<=2,"green", ifelse(z.sub<=3,"yellow",ifelse(z.sub<=4,"orange",ifelse(z.sub<=5,"red","red"))))),ifelse(z.ben<=1,"blue", ifelse(z.ben<=2,"green", ifelse(z.ben<=3, "yellow", ifelse(z.ben<=4,"orange",ifelse(z.ben<=5,"red","red"))))))
floating.pie(0,0,c(Phyto, Submerged, Benthic),radius=5*iniR, startpos=pi/2, col=colors,border=NA)
Slices1=c=(1)
total=100
iniR=0.2 # initial radius
colorst=c(ifelse(Z<=1,"blue", ifelse(Z<=2,"green",ifelse(Z<=3,"yellow",ifelse(Z<=4,"orange",ifelse(Z<=5,"red","red"))))))
floating.pie(0,0,c(total),radius=3*iniR, startpos=pi/2, col=colorst,border=NA)
angles=as.numeric(c(-10,75,80))
pie.labels(0,0,angles,c("Phyto","Submerged","Benthic"),radius=1, bg="white")
pie.labels(0,0,0,c("Overall Assessment"),radius=-0.3)
I guess it does not matter, what is into my script.
At the moment it only runs the first sheet in excel, but I have several and would like to run them on all of them.
The outcome should be a table looking like this
z z.sub z.ben Z Pie-chart (only if possible)
Sheet 1 0 NA NA High
Sheet 2 ... ... ... ...
Sheet 3 ... ... ... ...
I'm sorry if this is an ordinary question!
Hope someone is able to help
Thanks!
a better way to read xls is the library readxl.
# remove "#" if you don't have these libraries installed already
# install.packages("readxl") # faster excel reader
# install.packages("data.table") # faster everything, in this case rbindlist
library(readxl)
library(data.table)
sheets = 1:5 # index numbers or names of the sheets you want to read
readmysheets = function(sheets) {
df = read_excel(file="myexcel.xls", sheets)
}
myfiles = lapply(sheets, readmysheets) # apply the indices/names on the readmysheets function
# you now have a list:
str(myfiles)
# bind the separate sheets together
together = rbindlist(myfiles, fill = T)
Wrap the body of your script into a "for" loop. You have several was of doing this, here are two.
# pre-allocate an object to write to
out <- matrix(rep(NA, numsheets * numcols), ncols = numcols))
for (i in 1:nsheets) {
df=read.xlsx2(".../Chase_DK.xlsx", sheetIndex = i)
#... do calculations
out[, i] <- c(z, z.sub, z.ben, Z)
}
# second way, no need to pre-allocate anything
sapply(1:nsheets, FUN = function(x) {
df=read.xlsx2(".../Chase_DK.xlsx", sheetIndex = x)
#... do calculations
out <- c(z, z.sub, z.ben, Z) # specify what you wish the function to return
return(out) # sapply will try to simplify the combined result on its own
})

Resources