I am new to R and I don't quite know what structures to use and the correct syntax for them.
I have lists (that are more like tables with columns and column names). I would like to apply the same operations to multiple lists. I assumed for loops would be reasonable to use.
My functions are
1) use a column to calculate a new column. (calculate fold change from log2foldchange)
2) make a new list using a subset of the old list and name it adjusting the name of the original list name
Here are the lines of code that worked for these tables individually.
# Derive the linear fold change from the log2FoldChange column.
resCondition_anno$FoldChange <- 2^resCondition_anno$log2FoldChange
# Keep only the rows whose adjusted p-value is at most 0.05. The condition is
# evaluated on the padj column of resCondition_anno itself, so it cannot get
# out of step with a separately stored object. subset() also drops the rows
# where padj is NA.
resCondition_anno_padj05 <- subset(resCondition_anno, padj <= 0.05)
I would like to do these functions to multiple tables.
When I tried to do it in a for loop
# Attempt to add a FoldChange column to several tables and to create a
# filtered copy of each one inside a for loop.
# NOTE(review): c() does not appear to keep the tables separate -- it
# concatenates their columns into one flat list, so the loop visits single
# columns rather than whole tables.
resfiles1 <- c(resCondition_anno,resVirus_anno,resInter_anno)
for (i in resfiles1){
# NOTE(review): `i` is an atomic vector here (see the first error quoted
# below), which is why `$` fails.
i$FoldChange <- 2^i$log2FoldChange # I was trying to calculate a new column based on log2FoldChange column
# NOTE(review): paste() works on the contents of `i`, not on the name of the
# table, and its default separator puts a space before "_padj05".
i_with_padj05 <- paste(i,"_padj05") # I was trying to create a new name like resCondition_anno_padj05
# NOTE(review): this overwrites the value assigned on the previous line, and
# the unquoted `padj` is looked up as a variable (hence "object 'padj' not
# found" in the errors quoted below).
i_with_padj05 <- subset(i, i[[padj]] <= 0.05) # I was trying to subset my dataset based on values in the padj column
}
I tried to access the columns of my tables with $ and that gave me
Error: $ operator is invalid for atomic vectors
I tried to access the columns of my tables with [padj], I get
Error in subset.default(i, i[padj] <= 0.05) : object 'padj' not found
When I tried to access the columns of my table with `[[padj]]`, I got the following error
Error in subset.default(i, i[[padj]] <= 0.05) : object 'padj' not found
Am I going about this completely the wrong way? Are for loops a reasonable way to approach my goals? I know apply functions exist, but I had such a hard time getting output files out of them when I tried to feed multiple files into them that I wanted to give for loops a try.
I would appreciate code that works for an arbitrary table and does these things; then I can figure out whether my tables are weird.
dput(head(resCondition_anno))
structure(list(ensembl = c("ENSMUSG00000051951", "ENSMUSG00000102331",
"ENSMUSG00000025902", "ENSMUSG00000104238", "ENSMUSG00000102269",
"ENSMUSG00000096126"), baseMean = c(2.34691358937965, 0.169507902147731,
49.4591642836684, 0.253911076708937, 3.27439052075304, 0.258178295608587
), log2FoldChange = c(1.04699290132002, 1.89907052894015, 0.629095304499277,
0.0597400040882164, -0.291997327218544, 1.97984690635658), lfcSE = c(1.09309963258445,
4.36961772602319, 0.291712394209747, 4.37647193807779, 1.21524080418346,
4.3263845102792), stat = c(0.95782019324678, 0.434607933236415,
2.15656008104662, 0.0136502655411644, -0.240279396654017, 0.457621577937096
), pvalue = c(0.338153434807336, 0.66384703564954, 0.0310399577136823,
0.989109002094381, 0.810113666298446, 0.647224338296786), padj = c(NA,
NA, 0.106540309680362, NA, 0.911344697137259, NA), mgi_symbol = c("Xkr4",
"Gm19938", "Sox17", "Gm37587", "Gm7357", "Gm22307"), gene_biotype = c("protein_coding",
"sense_intronic", "protein_coding", "processed_transcript", "processed_pseudogene",
"snRNA")), class = c("data.table", "data.frame"), row.names = c(NA,
-6L), .internal.selfref = <pointer: 0x0000027bef7e1ef0>)
Expected results for the aim 1
> dput(head(resCondition_anno))
structure(list(ensembl = c("ENSMUSG00000051951", "ENSMUSG00000102331",
"ENSMUSG00000025902", "ENSMUSG00000104238", "ENSMUSG00000102269",
"ENSMUSG00000096126"), baseMean = c(2.34691358937965, 0.169507902147731,
49.4591642836684, 0.253911076708937, 3.27439052075304, 0.258178295608587
), log2FoldChange = c(1.04699290132002, 1.89907052894015, 0.629095304499277,
0.0597400040882164, -0.291997327218544, 1.97984690635658), lfcSE = c(1.09309963258445,
4.36961772602319, 0.291712394209747, 4.37647193807779, 1.21524080418346,
4.3263845102792), stat = c(0.95782019324678, 0.434607933236415,
2.15656008104662, 0.0136502655411644, -0.240279396654017, 0.457621577937096
), pvalue = c(0.338153434807336, 0.66384703564954, 0.0310399577136823,
0.989109002094381, 0.810113666298446, 0.647224338296786), padj = c(NA,
NA, 0.106540309680362, NA, 0.911344697137259, NA), mgi_symbol = c("Xkr4",
"Gm19938", "Sox17", "Gm37587", "Gm7357", "Gm22307"), gene_biotype = c("protein_coding",
"sense_intronic", "protein_coding", "processed_transcript", "processed_pseudogene",
"snRNA"), FoldChange = c(2.0662186086592, 3.72972827627808, 1.54659483966075,
1.0422779093498, 0.816770504282921, 3.94451221821964)), class = c("data.table",
"data.frame"), row.names = c(NA, -6L), .internal.selfref = <pointer: 0x0000027bef7e1ef0>)
Expected results for aim2
> dput(head(resCondition_anno_padj05))
structure(list(ensembl = c("ENSMUSG00000103922", "ENSMUSG00000025907",
"ENSMUSG00000061024", "ENSMUSG00000025911", "ENSMUSG00000025935",
"ENSMUSG00000025937"), baseMean = c(7.45083924607695, 1035.42915800337,
756.089939474399, 1510.50670239711, 2014.55644970672, 5206.99654662079
), log2FoldChange = c(3.31157886392159, -0.345358245876914, 0.340037961752993,
-0.637902858828505, 0.592795289538968, 0.59912370697665), lfcSE = c(0.984296895396084,
0.131191642000487, 0.0967702378760271, 0.120687031774959, 0.114283891072725,
0.161639505766009), stat = c(3.36441055479404, -2.63247140298489,
3.51386923517349, -5.28559572181691, 5.18704153292907, 3.70654255676794
), pvalue = c(0.000767073434065771, 0.00847661586751943, 0.000441630160084079,
1.25296333033368e-07, 2.13661093734535e-07, 0.000210107944374613
), padj = c(0.00522376704325313, 0.0385092726153939, 0.00325683272694307,
2.17721401368104e-06, 3.51690667040699e-06, 0.00168321660710376
), mgi_symbol = c("Gm6123", "Rb1cc1", "Rrs1", "Adhfe1", "Tram1",
"Lactb2"), gene_biotype = c("processed_pseudogene", "protein_coding",
"protein_coding", "protein_coding", "protein_coding", "protein_coding"
), FoldChange = c(9.92852128160573, 0.787112498791522, 1.26578990036559,
0.642646438673565, 1.5081660610658, 1.51479619975327)), class = c("data.table",
"data.frame"), row.names = c(NA, -6L), .internal.selfref = <pointer: 0x0000027bef7e1ef0>)
for aim 1
library(dplyr)
# Build a second, similar table so the example has more than one input.
resCondition_anno_dumb <- resCondition_anno
resCondition_anno_dumb$log2FoldChange <- resCondition_anno$log2FoldChange * 3
# Collect the tables in a *named* list (enter your own data frames here).
# lapply() carries the names over to its result, so every output can still be
# traced back to the table it was computed from, e.g.
# new_list$resCondition_anno.
list_t <- list(
  resCondition_anno = resCondition_anno,
  resCondition_anno_dumb = resCondition_anno_dumb
)
# mutate() adds a column to a table; lapply() applies that step to every table
# in the list and returns the modified tables as a new list.
new_list <- lapply(list_t, function(x) {
  x %>% mutate(FoldChange = 2^log2FoldChange)
})
for aim 2 something like
# For every table in list_t keep only the rows with padj <= 0.05
# (filter() also drops the rows where padj is NA).
new_list <- lapply(list_t, function(tbl) filter(tbl, padj <= 0.05))
or you can pipe them together:
# Both steps in one pass per table: add FoldChange, then keep the rows with
# padj <= 0.05.
new_list <- lapply(list_t, function(tbl) {
  tbl %>%
    mutate(FoldChange = 2^log2FoldChange) %>%
    filter(padj <= 0.05)
})
Related
I have a large dataset of gene expression data and I'm trying to convert the gene identifiers into gene names using biomaRt in RStudio, but for some reason when I use the merge function on my data frames, my entire data table is merged wrong/erased. I've looked at the previous questions here, but no matter what I try, my code doesn't seem to work properly. Thank you infinitely!
library(biomaRt)
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "genes"
head(resdata)
## Write results
resdata <- resdata[complete.cases(resdata), ]
dim(resdata)
The problems start here:
# Convert Ensembl gene accessions to HGNC gene symbols.
charg <- resdata$genes
head(charg)
# Drop the version suffix ("ENSG00000261150.2" -> "ENSG00000261150"): the
# 'ensembl_gene_id' filter expects unversioned stable IDs.
charg2 <- vapply(
  strsplit(as.character(charg), ".", fixed = TRUE),
  function(x) x[1],
  character(1)
)
# Keep the unversioned ID next to the results so there is a key to join on.
resdata$ensembl_gene_id <- charg2
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
# Request the ID together with the symbol: a result that only contains
# hgnc_symbol cannot be matched back to the rows of resdata.
theBM <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  filters = "ensembl_gene_id",
  values = unique(charg2),
  mart = ensembl
)
# Join on the Ensembl ID, which both tables share. all.x = TRUE keeps the genes
# for which no symbol was returned (their hgnc_symbol is NA) instead of
# silently dropping them.
resdata <- merge(resdata, theBM, by = "ensembl_gene_id", all.x = TRUE)
# a <- c(resdata[3])
# counts_resdata <-counts[resdata$ensembl_gene_id,]
# row.names(counts_resdata) <- resdata[,"V1"]
# cal_z_score <- function(x){
# (x - mean(x)) / sd(x)
# }
write.csv(resdata, file="diffexprresultsHEK.csv")
dev.off()
> dput(head(resdata))
structure(list(genes = structure(c("ENSG00000261150.2", "ENSG00000164877.18",
"ENSG00000120334.15", "ENSG00000100906.10", "ENSG00000182759.3",
"ENSG00000124145.6"), class = "AsIs"), baseMean = c(4093.85581350533,
2362.58393155573, 3727.90538524843, 6269.83601940967, 1514.2066991352,
4802.56186913745), log2FoldChange = c(-7.91660950515258, -5.26346217291626,
3.32325541003148, 2.95482654632078, -5.67082078657074, 2.79396304109662
), lfcSE = c(0.192088463317979, 0.149333035266368, 0.105355230912976,
0.097569264524605, 0.194208068005162, 0.0965853229316347), stat = c(-41.2133522670104,
-35.2464688307429, 31.5433356391815, 30.2843990955331, -29.1997178326289,
28.9274079776516), pvalue = c(0, 3.88608699685236e-272, 2.21307385030673e-218,
1.83983881587879e-201, 1.95527687476496e-187, 5.40010609376884e-184
), padj = c(0, 3.9601169541424e-268, 1.50348860477005e-214, 9.3744387266064e-198,
7.97009959691694e-184, 1.83432603828505e-180), `HEK-FUS1-1.counts` = c(8260.9703617894,
5075.51515177084, 665.085490083024, 1513.61286043731, 3440.18729968435,
1262.3583419615), `HEK-FUS1-2.counts` = c(8046.96326903085, 4134.79795973702,
690.697680591815, 1346.52518701783, 2499.92325557892, 1154.73922910593
), `HEK-H149A-1.counts` = c(34.3284200812733, 113.825813953696,
6450.12945737609, 10806.2252897945, 60.5264248801398, 8302.96076228903
), `HEK-H149A-2.counts` = c(33.1612031197744, 126.196800761364,
7105.70891294277, 11412.980740389, 56.1898163973955, 8490.18914319335
)), row.names = c(NA, 6L), class = "data.frame")
Here's some output (where I'm struggling):
> head(charg)
[1] "ENSG00000261150.2" "ENSG00000164877.18" "ENSG00000120334.15"
[4] "ENSG00000100906.10" "ENSG00000182759.3" "ENSG00000124145.6"
> dim(theBM)
[1] 0 1
> head(theBM)
[1] ensembl_gene_id
<0 rows> (or 0-length row.names)
> dim(resdata)
[1] 20381 11
> resdata <- merge.data.frame(resdata, theBM, by.x="genes",by.y="ensembl_gene_id")
> dim(resdata) #after merge
[1] 0 11 #isn't correct -- just row names! where'd my genes go?
Edit: Problems solved! Turns out I was referencing getBM wrong. Thank you all!
If you just want to overwrite the Ensembl IDs with the HGNC symbols, you can do it in one step:
library(biomaRt)
names(resdata)[1] <- "genes"
head(resdata)
## Write results
resdata <- resdata[complete.cases(resdata), ]
dim(resdata)
charg <- resdata$genes
head(charg)
# Drop the version suffix ("ENSG00000261150.2" -> "ENSG00000261150"): the
# 'ensembl_gene_id' filter expects unversioned stable IDs.
charg2 <- vapply(
  strsplit(as.character(charg), ".", fixed = TRUE),
  function(x) x[1],
  character(1)
)
ensembl <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
# getBM() returns one row per ID/symbol pair it finds. The rows are not in the
# order of `values`, and IDs without a hit are absent, so the result cannot be
# assigned to a column directly. Ask for the ID as well and look each row up.
theBM <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  filters = "ensembl_gene_id",
  values = unique(charg2),
  mart = ensembl
)
symbols <- theBM$hgnc_symbol[match(charg2, theBM$ensembl_gene_id)]
# Overwrite the first column with the symbol; keep the original accession for
# genes that have no (or an empty) HGNC symbol so no row loses its identifier.
has_symbol <- !is.na(symbols) & nzchar(symbols)
resdata[[1]] <- ifelse(has_symbol, symbols, as.character(charg))
resdata
(This keeps Log2FC as column 3, which looks right based on the next steps in your pipeline, but if you want something different let me know and I'll update my answer to suit)
I am trying to filter the output from RNA-seq data analysis. I want to generate a list of genes that fit the specified criteria in at least one experimental condition (dataframe).
For example, the data is output as a .csv, so I read in the whole directory, as follows.
# `pattern` is a regular expression, not a shell glob: "\\.csv$" matches file
# names that end in ".csv". full.names = TRUE returns complete paths, so
# read.csv() can open the files whatever the current working directory is.
readList <- list.files(
  "~/Path/To/File/",
  pattern = "\\.csv$",
  full.names = TRUE
)
# row.names = 1 uses the first column (the gene names) as row names
files <- lapply(readList, read.csv, row.names = 1)
This reads in 3 .csv files, A, B and C. The data look like this
A = files[[1]]
B = files[[2]]
C = files[[3]]
head(A)
logFC logCPM LR PValue FDR
YER037W -1.943616 6.294092 34.30835 0.000000004703583 0.00002276064
YJL184W -1.771273 5.840774 31.97088 0.000000015650144 0.00003786552
YFR053C 1.990102 10.107793 30.55576 0.000000032440747 0.00005232692
YDR342C 2.096877 6.534761 28.08635 0.000000116021451 0.00014035695
YGL062W 1.649138 8.940714 23.32097 0.000001370968319 0.00132682314
YFR044C 1.992810 9.302504 22.91553 0.000001692786468 0.00132736130
I then try to filter all of these to generate a list of genes (rownames) where two conditions must be met in at least one dataset.
1.logFC > 1 or < -1
2.FDR < 0.05
So I loop through the dataframes like so
# Collect the names of all genes that pass both thresholds in at least one of
# the tables in `files`. Start from an empty character vector so that no ""
# entry ends up in the result.
genesKeep <- character(0)
for (i in seq_along(files)) {
  tbl <- files[[i]]
  # A gene qualifies when FDR < 0.05 and |logFC| > 1. abs() has to wrap logFC
  # itself: abs(logFC > 1) takes the absolute value of a TRUE/FALSE vector, so
  # it only ever selects logFC > 1 and misses every gene with logFC < -1.
  # which() discards comparisons that evaluate to NA.
  hits <- which(tbl$FDR < 0.05 & abs(tbl$logFC) > 1)
  genesKeep <- c(genesKeep, rownames(tbl)[hits])
}
This gives me a list of genes, however, when I sanity check these against the data some of the genes listed do not pass these thresholds, whilst other genes that do pass these thresholds are not present in the list.
e.g.
# Combine the three tables by gene name rather than by row position. cbind()
# glues rows together in whatever order each file lists them and takes the row
# names from the first table only, so genes get paired with values that belong
# to other genes whenever the files are ordered differently.
tagged <- Map(
  function(tbl, tag) {
    out <- data.frame(gene = rownames(tbl), tbl, stringsAsFactors = FALSE)
    # Prefix every value column with its table name: A.logFC, A.FDR, ...
    names(out)[-1] <- paste(tag, names(tbl), sep = ".")
    out
  },
  list(A, B, C),
  c("A", "B", "C")
)
# all = TRUE keeps genes that are missing from one or two of the tables.
df <- Reduce(function(x, y) merge(x, y, by = "gene", all = TRUE), tagged)
rownames(df) <- df$gene
df$gene <- NULL
genesKeep <- unique(genesKeep)
# TRUE for every gene that was collected in genesKeep
logicTest <- rownames(df) %in% genesKeep
dfLogic <- cbind(df, logicTest)
Whilst the majority of genes do in fact pass the criteria I set, I see some discrepancies for a few genes. For example
A.logFC A.FDR B.logFC B.FDR C.logFC C.FDR logicTest
YGR181W -0.8050325 0.1462688 -0.6834184 0.2162317 -1.1923744 0.04049870 FALSE
YOR185C 0.8321432 0.1462919 0.7401477 0.2191413 -0.9616989 0.04098177 TRUE
The first gene (YGR181W) passes the criteria in condition C, where logFC < -1 and FDR < 0.05. However, the gene is not reported in the genesKeep list.
Conversely, the second gene (YOR185C) does not pass these criteria in any condition, but the gene is present in the genesKeep list.
I'm unsure where I'm going wrong here, but if anyone has any ideas they would be much appreciated.
Thanks.
Using merge as suggested by akash87 solved the problem.
Turns out cbind was causing the rownames to not be assigned correctly.
I'm not exactly sure what your desired output is here, but it might be possible to simplify a bit and use the dplyr library to filter all your outputs at once, assuming the format of your data is consistent. Using some modified versions of your data as an example:
A <- structure(list(gene = structure(c(2L, 6L, 4L, 1L, 5L, 3L), .Label = c("YDR342C",
"YER037W", "YFR044C", "YFR053C", "YGL062W", "YJL184W"), class = "factor"),
logFC = c(-1.943616, -1.771273, 0, 2.096877, 1.649138, 1.99281
), logCPM = c(6.294092, 5.840774, 10.107793, 6.534761, 8.940714,
9.302504), LR = c(34.30835, 31.97088, 30.55576, 28.08635,
23.32097, 22.91553), PValue = c(4.703583e-09, 1.5650144e-08,
3.2440747e-08, 1.16021451e-07, 1.370968319e-06, 1.692786468e-06
), FDR = c(2.276064e-05, 3.786552e-05, 5.232692e-05, 0.00014035695,
0.00132682314, 0.06)), .Names = c("gene", "logFC", "logCPM",
"LR", "PValue", "FDR"), class = "data.frame", row.names = c(NA,
-6L))
B <- structure(list(gene = structure(c(2L, 6L, 4L, 1L, 5L, 3L), .Label = c("YDR342C",
"YER037W", "YFR044C", "YFR053C", "YGL062W", "YJL184W"), class = "factor"),
logFC = c(-0.4, -0.3, 0, 2.096877, 1.649138, 1.99281), logCPM = c(6.294092,
5.840774, 10.107793, 6.534761, 8.940714, 9.302504), LR = c(34.30835,
31.97088, 30.55576, 28.08635, 23.32097, 22.91553), PValue = c(4.703583e-09,
1.5650144e-08, 3.2440747e-08, 1.16021451e-07, 1.370968319e-06,
1.692786468e-06), FDR = c(2.276064e-05, 3.786552e-05, 5.232692e-05,
0.00014035695, 0.1, 0.06)), .Names = c("gene", "logFC", "logCPM",
"LR", "PValue", "FDR"), class = "data.frame", row.names = c(NA,
-6L))
Use rbind to create a single dataframe to work with:
AB<- rbind(A,B)
Then filter this whole thing based on your criteria. Note that duplicates can occur, so you can use distinct to only return unique genes that qualify:
filter(AB, logFC < -1 | logFC > 1, FDR < 0.05) %>%
distinct(gene)
gene
1 YER037W
2 YJL184W
3 YDR342C
4 YGL062W
Or, to keep all the rows for those genes as well:
filter(AB, logFC < -1 | logFC > 1, FDR < 0.05) %>%
distinct(gene, .keep_all = TRUE)
gene logFC logCPM LR PValue FDR
1 YER037W -1.943616 6.294092 34.30835 4.703583e-09 2.276064e-05
2 YJL184W -1.771273 5.840774 31.97088 1.565014e-08 3.786552e-05
3 YDR342C 2.096877 6.534761 28.08635 1.160215e-07 1.403570e-04
4 YGL062W 1.649138 8.940714 23.32097 1.370968e-06 1.326823e-03
I have two CSV data frames, data1 and data2, of dimensions n1*n2 and m1*m2. I would like to create a new data frame consisting of differences: If (and only if)
data1[i,1] = data2[j,1] & data1[i,3] = data2[j,3]
then I want to consider
difference[i,z] <- abs(data1[i,x]-data2[i,y])
Is it possible to do this in a simple manner, for instance using for/if?
# Sketch: for every row pair (i, j) whose columns 1 and 3 agree, store columns
# 1 and 3 of data1 plus a price difference in row i of `difference`.
# NOTE(review): `max{n1,m1}` is not valid R syntax; presumably max(n1, m1).
# NOTE(review): n1 and m1 are not defined in this snippet; presumably
# nrow(data1) and nrow(data2).
difference <- matrix(nrow = max{n1,m1}, ncol = 3)
for (i in 1:n1) {
for (j in 1:m1) {
if(data1[i,1] == data2[j,1] & data1[i,3] == data2[j,3]){
difference[i,1] = data1[i,1]
difference[i,2] = data1[i,3]
# NOTE(review): the EDIT below lists the price as column 7 of data1 and column
# 6 of data2, so these two indices look swapped; the abs() from the description
# above is also missing -- confirm.
difference[i,3] = data1[i,6]-data2[j,7]
}
}
# NOTE(review): the closing brace of the outer for loop is missing.
This code is obviously far from being complete and I have several issues:
(1) I don't know if it is realizable using for loops/if conditional. If yes, being unfamiliar with R, I'm not sure if I need to put a 'print(something)' at the end of the loops.
(2) data1/2[i,1] is of type character. Hence I'm not sure if
data1[i,1] == data2[j,1] & data1[i,3] == data2[j,3]
is well-defined.
(3) The 'difference' matrix/frame should have as many rows as the number of i's and j's where
data1[i,1] = data2[j,1] & data1[i,3] = data2[j,3]
I do not know what this number is. Therefore I cannot really specify the size of 'difference'.
EDIT:
data1 = read.csv("path/to/data1.csv") ## Prices of 157 products each at
## 122 time points; (column1=Product, column3=date, column7=price)
data2 = read.csv("path/to/data2.csv") ## Prices of 118 products each at
## 122 time points; (column1=Product, column3=date, column6=price)
## the 122 time points are the same for both frames
## But: data1 contains some products data2 doesn't and vice versa
## I want to compare prices of the same products at the same time
So far, I've done it manually for product X1:
priceX1 = as.data.frame(data1[c(1,122),7])
priceX2 = as.data.frame(data2[c(5,126),6]) ## Product X2 starts at row 5
differenceX1 <- abs(priceX1 - priceX2)
The problem is I'd have to repeat this for all products contained in both data1 and data2.
RE-EDIT: dput(data1) returns
...), class = "factor"),
COMMENT = c(NA, ..., NA)), .Names = c("PRODUCT", "QUALIFIER_I",
"DATE", "QUALIFIER_II", "QUOTATION_DATE", "PROD_DATE", "PRICE",
"TYPE", "ID", "COMMENT"), row.names = c(NA, 14400L), class
= "data.frame")
"..." stands for me omitting a long list of products that couldn't fit here.
dput(data2) returns
..., NA, NA, NA)), .Names = c("PRODUCT", "QUALIFIER_II",
"DATE", "QUALIFIER_I", "Data2_source", "PRICE"), row.names = c(NA,
19161L), class = "data.frame")
"..." stand for me omitting a huge list of prices that couldn't fit in here.
You can find all pairs (i,j) which satisfy your condition by merging the two data.frames:
# Inner join: one row for every (PRODUCT, DATE) pair present in both tables.
differences <- merge(x = data1, y = data2, by = c("PRODUCT", "DATE"))
This avoids for-loops entirely, and you can easily define the new column:
# PRICE.x comes from data1 and PRICE.y from data2 (merge() adds the suffixes).
differences[["Diff"]] <- with(differences, abs(PRICE.x - PRICE.y))
Is there a functional form of the assignment operator? I would like to be able to call assignment with lapply, and if that's a bad idea I'm curious anyways.
Edit:
This is a toy example, and obviously there are better ways to go about doing this:
Let's say I have a list of data.frames, dat, each corresponding to one run of an experiment. I would like to be able to add a new column, "subject", and give it a sham-name. The way I was thinking of it was something like
lapply(1:3, function(x) assign(data.frame = dat[[x]], column="subject", value=x)
The output could either be a list of modified data frames, or the modification could be purely a side effect.
dput of the starting list
list(structure(list(V1 = c(-1.16664504687199, -0.429499924318301, 2.15470735901367, -0.287839633854442, -0.850578353982526, 0.211636723222015, -0.184714165752958, -0.773553182015158, 0.801811848828454, 1.39420292299319 ), V2 = c(-0.00828185523886259, -0.0215669898046275, 0.743065397283645, -0.0268464140141802, 0.168027242784788, -0.602901928341917, 0.0740511186398372, 0.180307494696194, 0.131160421341309, -0.924995634374182)), .Names = c("V1", "V2"), row.names = c(NA, -10L), class = "data.frame"), structure(list( V1 = c(1.81912921386885, 1.17011641727415, 0.692247839769473, 0.0323050362633069, 1.35816977313292, -0.437475434344363, -0.270255715332778, 0.96140963297774, 0.914691132220417, -1.8014509598977), V2 = c(1.45082316226241, 2.05135744606495, -0.787250759618171, 0.288104852581324, -0.376868533959846, 0.531872044490353, -0.750375220117567, -0.459592764008714, 0.991667163481123, 1.31280356980115)), .Names = c("V1", "V2" ), row.names = c(NA, -10L), class = "data.frame"), structure(list( V1 = c(0.528912899341174, 0.464615157920766, -0.184211714281637, 0.526909095449027, -0.371529800682086, -0.483772861751781, -2.02134822661341, -1.30841566046747, -0.738493559993166, -0.221463545903242), V2 = c(-1.44732101816006, -0.161730785376045, 1.06294520132753, 1.22680614207705, -0.721565979363022, -0.438309438404104, -0.0243401435910825, 0.624227513999603, 0.276605218579759, -0.965640602482051)), .Names = c("V1", "V2"), row.names = c(NA, -10L), class = "data.frame"))
Maybe I don't get it but as stated in "The Art of R programming":
Any assignment statement in which the left side is not just an
identifier (meaning a variable name) is considered a replacement
function.
and so in fact you can always translate this:
names(x) <- c("a","b","ab")
to this:
x <- "names<-"(x,value=c("a","b","ab"))
the general rule is just "function_name<-"(<object>, value = c(...))
Edit to the comment:
It works with the " too:
> x <- c(1:3)
> x
[1] 1 2 3
> names(x) <- c("a","b","ab")
> x
a b ab
1 2 3
> x
a b ab
1 2 3
> x <- c(1:3)
> x
[1] 1 2 3
> x <- "names<-"(x,value=c("a","b","ab"))
> x
a b ab
1 2 3
There is the assign function. I don't see any problems with using it but you have to be aware of what environment you want to assign to. See the help ?assign for syntax.
Read this chapter carefully to understand the ins and outs of environments in detail. http://adv-r.had.co.nz/Environments.html
I have this data frame:
data
structure(list(Time = structure(1:4, .Label = c("2015-01-18 02:00:00",
"2015-01-18 03:00:00", "2015-01-18 04:00:00", "2015-01-18 05:00:00"
), class = "factor"), Server1 = c(12.92, NA, 10, 10.17), Server2 = c(13.42,
NA, 9.42, 10.83), Server3 = c(NA, 9.08, 9.17, 8.58)), .Names = c("Time",
"Server1", "Server2", "Server3"), class = "data.frame", row.names = c(NA,
-4L))
These are the variables:
dc=c("dc1")
type=c("Resource_Utilization")
app=c("DB")
metric=c(".PercentCPU")
I need to be able to print each column's data on a separate line, something like this:
Server1.PercentCPU 1422165600 2 Host=Server1 source=WebTier dc=dc1 app=DB type=Resource_Utilization
I am currently doing this:
# For every server column (columns 2..ncol of `data`) print one line per
# non-missing reading: metric name, time, value and the key=value tags.
for (i in 2:ncol(data)){
# one-column data frame holding the readings of the current server
data1<-data[i]
# put the Time column in front of it
data1<-cbind(data[1],data1)
# drop the rows in which this server has no reading (NA)
data1<-data1[complete.cases(data1),]
# metric label: column name followed by `metric`
data1$Metric<-paste0(colnames(data[i]),metric)
# NOTE(review): Time is a factor in the dput above, so as.numeric() returns
# the level codes rather than epoch seconds -- confirm this is intended.
data1$Time<-as.numeric(data1$Time)
n<-names(data1)
# n[2] is the column name of the current server
data1$Host=paste0("Host=",n[2])
data1$source=paste0("source=","WebTier")
data1$dc=paste0("dc=",dc)
data1$app=paste0("app=",app)
data1$type=paste0("type=",type)
# reorder the columns to: Metric, Time, value, Host, source, dc, app, type
data1<-data.frame(data1[,c(3,1,2,4,5,6,7,8)])
# NOTE(review): the right-hand side comes from the unfiltered `data` (all rows,
# and always column 3 whatever `i` is), while data1 has already lost its NA
# rows -- this is the length mismatch reported in the error quoted below.
data1[,3]<-as.numeric(data[,3])*1024
# no file argument: the rows are written to the console, without header, row
# names or quotes
write.table(data1, row.names=F, col.names=F, quote=F)
}
I get this error:
Error in `[<-.data.frame`(`*tmp*`, , 3, value = c(13742.08, NA, 9646.08, :
replacement has 4 rows, data has 3
There will be times when some cells are NA. I need to come up with a way to handle the NAs in my script. Any ideas how I could do this so that I only skip the NA cells?
This error is caused by
# drop rows with NA's
data1<-data1[complete.cases(data1),]
[lots of calculations]
# replace all rows of the third column of the original matrix
data1[,3]<-as.numeric(data[,3])*1024
and hence, you are trying to replace a short vector (column) with a longer column.
One way around this problem is to store the index and re-use it during the assignment, as in:
# drop rows with NA's, remembering which rows of the original data were kept
validRows <- complete.cases(data1)
data1 <- data1[validRows, ]
# ... lots of calculations ...
# Replace the third column using only the rows that survived the NA filter.
# The values are read from column i of the original data (the server that is
# currently being processed), not from a fixed column 3, so every server gets
# its own readings.
data1[, 3] <- as.numeric(data[validRows, i]) * 1024