R apply funciton on each cell in data frame - r

I have a data frame that look something like this
> dput(tes)
structure(list(path = structure(1:6, .Label = c("1893-chicago-fair",
"1960s-afghanistan", "1970s-iran", "1980s-new-york", "20-bizarre-vintage-ads",
"20-bizarre-vintage-ads?utm_campaign=6678&utm_medium=rpages&utm_source=Facebook&utm_term=1e8e704f7b587515c72e6cf7895d55fd110b652c480d98c1440f0a7acba5fb0e",
"20-photos-segregation-america-show-far-weve-come-much-farther-go",
"7-bizarre-cultural-practices", "7-creepy-abandoned-cities?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=4015a7368b588ff09694c96ba720c58f4e7f41a05b4181908b582bae682bef5e",
"a-brief-history-of-hippies", "abandoned-photographs", "albert-kahn",
"amazing-facts", "american-bison-extinction-1800s", "american-english-vs-british-english",
"andre-the-giant-photos", "andre-the-giant-photos??utm_source=facebook&sr_source=lift_facebook&utm_campaign=simplereach_andre-the-giant-photos&utm_medium=social",
"andre-the-giant-photos?grvVariant=d27feef0bfad84d60f335d3a8d241d9e",
"andre-the-giant-photos?grvVariant=d27feef0bfad84d60f335d3a8d241d9e&utm_campaign=gravityus2_142deb68f67fb1a99e7b80250fecc932&utm_medium=referral&utm_source=gravity",
"andre-the-giant-photos?grvVariant=d27feef0bfad84d60f335d3a8d241d9e&utm_campaign=gravityus2_16d63cf07ecf656f602b2d6b209344f7&utm_medium=referral&utm_source=gravity",
"andre-the-giant-photos?grvVariant=d27feef0bfad84d60f335d3a8d241d9e&utm_campaign=gravityus2_713050ecffc51540af02b2246ddf57dd&utm_medium=referral&utm_source=gravity",
"andre-the-giant-photos?grvVariant=d27feef0bfad84d60f335d3a8d241d9e&utm_campaign=gravityus2_c5bb3bc5e9408e0ad52ec9e787bd8654&utm_medium=referral&utm_source=gravity",
"andre-the-giant-photos?sr_source=lift_facebook&utm_campaign=simplereach_andre-the-giant-photos&utm_medium=social&utm_source=facebook",
"astounding-aerial-photography", "astounding-aerial-photography?utm_campaign=7002&utm_medium=rpages&utm_source=Facebook&utm_term=38e9e903d9ba59106d8b4d19be593f3de7ff8b91b12eafa03f2e382228f7b0d1",
"august-landmesser", "ben-franklin", "best-all-that-is-interesting-articles",
"bigfoot-facts", "celebrity-school-photos?grvVariant=82c0ce57a33dfd0209bdefc878665de0&utm_campaign=gravityus2_bc8646aefd6d0a16af03d7caf248f226&utm_medium=referral&utm_source=gravity",
"coolest-mushrooms?utm_campaign=taboolaINTL&utm_medium=referral&utm_source=taboola",
"craziest-ways-drugs-smuggled", "creepy-halloween-costumes",
"danakil-depression", "dark-john-lennon-quotes", "david-bowie-quotes",
"days-in-groundhog-day", "death-photos", "death-photos?utm_campaign=taboolaINTL&utm_medium=referral&utm_source=taboola",
"dr-seuss-quotes", "dream-chaser-spacecraft", "dust-bowl", "earth-two-planets",
"eixample-barcelona", "email-to-space", "evil-science-experiments",
"famous-incest", "famous-spies", "fun-facts-trivia", "golden-age-air-travel?utm_campaign=taboolaINTL&utm_medium=referral&utm_source=taboola",
"gross-foods", "gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=106965c54919c24bf37356500ec50f0709b1de621d6950bb4c5d48759ea3677e",
"gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=184e0ee39e66af82f9b124b904f6e07964b211e902cb0dc00c28771ff46163a2",
"gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=1a0ddea7bed770d5473c45e9f8d81dfd0c4fdd232f207c6b88b53c41ff220c59",
"gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=538659f1fc53f28d2c87b93ac73973681c1a46a04954964ab6c52ed1ab09b33a",
"gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=87caf0acb91ae2b202f1b00ad9eaad3fef20bbfb23405b9047fb2b5a5462ab9c",
"gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=91eae42c8fc9568103d46e0b6b6ec08fc34fd68b2e1918ffe2333ec73035c95a",
"gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=a72946874b2003a8e40635c6cf10c851d4e1c0ed45e645d69663214239550602",
"gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=ab594f0a1be002c8c3db297e8d33b04678af40e6a6469ac815884ae0a014b3a3",
"gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=fb1e333dd58cb7bb9251ec52290aae21771149f73e083440047068a69aaeae09",
"hilarious-insults", "hippie-communes", "hippie-communes?grvVariant=fda07538efb1c25617f7cc3d09c37c79",
"hippie-communes?grvVariant=fda07538efb1c25617f7cc3d09c37c79&utm_campaign=gravityus2_e3cd42d4745768460dab4694a972fd82&utm_medium=referral&utm_source=gravity",
"hippie-communes?pp=0", "history-of-the-vibrator", "history-of-the-vibrator?utm_campaign=whfbpd&utm_medium=social&utm_source=facebook",
"homosexuality-norm", "hunger-games-facts?utm_campaign=6905&utm_medium=rpages&utm_source=Facebook&utm_term=1a9e42ac8abb6ffa90bf0542206505e74d3df12114a2c4445527fb2b88ef8880",
"influential-photographs", "ingeniously-creative-ads", "insane-cults",
"insane-rulers", "inspirational-quotes", "inspirational-quotes?utm_medium=referral&utm_source=taboolainternal",
"interesting-facts-about-the-world", "interesting-quotes", "krokodil",
"making-a-murderer-theories", "maya-angelou-greatest-quotes",
"medieval-torture-devices", "milky-way-colorado", "montreal-metro",
"most-popular-female-names-in-america", "neil-degrasse-tyson-tweets",
"new-york-city-cinemagraphs", "new-york-subways-1980s", "north-korea-photographs",
"north-korea-photographs?utm_campaign=taboolaINTL&utm_medium=referral&utm_source=taboola",
"north-korea-photographs?utm_medium=referral&utm_source=taboolainternal",
"obama-aging", "pablo-escobar", "pablo-escobar??utm_source=facebook",
"pablo-escobar??utm_source=facebook&sr_source=lift_facebook&utm_campaign=simplereach_pablo-escobar&utm_medium=social",
"pablo-escobar?utm_campaign=whfbpd&utm_medium=social&utm_source=facebook",
"panda-facts", "photo-of-the-day-nasa-releases-crystal-clear-image-of-pluto",
"pollution-in-china-photographs", "pollution-in-china-photographs?utm_campaign=3434&utm_medium=rpages&utm_source=Facebook&utm_term=1a0ddea7bed770d5473c45e9f8d81dfd0c4fdd232f207c6b88b53c41ff220c59",
"pollution-in-china-photographs?utm_campaign=3434&utm_medium=rpages&utm_source=Facebook&utm_term=e28a76c1572c36c3a13965e52b4b2ea10518eb9f9c79c4bc84cfb85db16be81e",
"pollution-in-china-photographs?utm_campaign=6806&utm_medium=rpages&utm_source=Facebook&utm_term=1a0ddea7bed770d5473c45e9f8d81dfd0c4fdd232f207c6b88b53c41ff220c59",
"pollution-in-china-photographs?utm_campaign=7048&utm_medium=rpages&utm_source=Facebook&utm_term=2ef4bd7b6cd587601d6eeb35925282a1ed095ebbd4e9e4c0337ef868c7de7a0b",
"pollution-in-china-photographs?utm_campaign=7458&utm_medium=rpages&utm_source=Facebook&utm_term=b9e79a51cd4daf4c3ec02accce75b3e1fc9a22cb3133460c9c32a4f2f9cdb68c",
"powerful-photos-of-2014", "real-x-files", "romanovs-last-days",
"science-of-human-decay", "scientific-discoveries-2015", "scully-effect",
"serial-killer-quotes", "shah-iran", "six-of-the-craziest-gods-in-mythology",
"space-facts", "sun-facts", "sunken-cities", "sunken-ships",
"super-bowl-i-facts", "superhero-movies", "surreal-places", "syrian-civil-war-photographs",
"the-five-greatest-mysteries-of-human-history", "the-four-most-important-battles-of-ancient-greece",
"the-most-colorful-cities-in-the-world", "titanic-facts", "titanic-facts?utm_campaign=6385&utm_medium=rpages&utm_source=Facebook&utm_term=f5905e878216d14e20457ee3265caf6c10022d9545609edfb9a3cb0642c1a310",
"titanic-facts?utm_campaign=6899&utm_medium=rpages&utm_source=Facebook&utm_term=b9e79a51cd4daf4c3ec02accce75b3e1fc9a22cb3133460c9c32a4f2f9cdb68c",
"titanic-facts?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=106965c54919c24bf37356500ec50f0709b1de621d6950bb4c5d48759ea3677e",
"titanic-facts?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=538659f1fc53f28d2c87b93ac73973681c1a46a04954964ab6c52ed1ab09b33a",
"titanic-facts?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=91eae42c8fc9568103d46e0b6b6ec08fc34fd68b2e1918ffe2333ec73035c95a",
"titanic-facts?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=ab594f0a1be002c8c3db297e8d33b04678af40e6a6469ac815884ae0a014b3a3",
"titanic-facts?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=d1864657a05e5b716bb5cb16a29f068a55652eb39fb669ea9c22a6486198f227",
"titanic-facts?utm_campaign=7292&utm_medium=rpages&utm_source=Facebook&utm_term=f5905e878216d14e20457ee3265caf6c10022d9545609edfb9a3cb0642c1a310",
"us-veterans-portraits", "vintage-disneyland", "wall-street-early-20th-century",
"what-we-love-this-week-the-incredible-last-words-of-famous-historical-figures",
"woodstock-photos", "zombie-proof-house"), class = "factor"),
`0089` = c(0, 0, 0, 0, 0, 1), `0096` = c(0, 0, 0, 0, 0, 0
), `02` = c(0, 0, 0, 0, 0, 0), `0215` = c(0, 0, 0, 0, 0,
0), `0225` = c(0, 0, 0, 0, 0, 0), `0252` = c(0, 0, 0, 0,
0, 0), `0271` = c(0, 0, 0, 0, 0, 0), `0272` = c(0, 0, 0,
0, 0, 0), `03` = c(0, 0, 0, 0, 1, 1)), .Names = c("path",
"0089", "0096", "02", "0215", "0225", "0252", "0271", "0272",
"03"), row.names = c(NA, 6L), class = "data.frame")
and I need to apply the min(x,1) function such that this function scan each value in the dataframe (except first column which is not numeric) and return the min(x,1). that way I have only zero's and one's.
I have tried:
f <- function(x) min(1,x)
res1<-do.call(f,tes[,2:ncol(tes)])
but that does not output the right result.
Any help aapreciated

We can use pmin
tes[,-1] <- pmin(1, as.matrix(tes[,-1]))
Or if we need only binary values
tes[,-1] <- +(!!tes[,-1])

Related

Aggregate similar constructs/ FA with binary variables

I would like to aggregate, in order to reduce the number of constructs, its following data frame containing only binary variables that correspond to "yes/no", its following data frame (first 10 row). The original data frame contains 169 rows.
outcome <-
structure(list(Q9_Automazione.processi = c(0, 0, 0, 0, 0, 0,
1, 1, 1, 0), Q9_Velocita.Prod = c(1, 0, 0, 1, 0, 0, 1, 1, 1,
0), Q9_Flessibilita.Prod = c(0, 0, 0, 1, 0, 0, 1, 1, 0, 1), Q9_Controllo.processi = c(0,
0, 0, 1, 0, 0, 1, 1, 0, 0), Q9_Effic.Magazzino = c(0, 0, 0, 1,
0, 0, 0, 0, 0, 0), Q9_Riduz.Costi = c(0, 1, 0, 0, 0, 0, 0, 0,
0, 1), Q9_Miglior.Sicurezza = c(0, 0, 0, 0, 0, 0, 1, 0, 1, 1),
Q9_Connett.Interna = c(0, 0, 0, 0, 0, 0, 0, 1, 1, 0), Q9_Connett.Esterna = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), Q9_Virtualizzazione = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0), Q9_Innov.Prod = c(0, 0, 0, 0, 0,
1, 0, 0, 0, 1), Q9_Person.Prod = c(0, 1, 0, 1, 0, 1, 0, 0,
0, 1), Q9_Nuovi.Mercati = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Q9_Nuovi.BM = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Q9_Perform.Energ = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), Q9_Perform.SostAmb = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA, 10L), class = "data.frame")
I have tried performing factor analysis via the tethracoric method on the obtained correlation matrix ( the obtained value from the KMO function turns out to be inadequate) both directly on the dataframe and then using tethracoric correletions in fafunction (using cor = "tet" I get a negative Tucker Lewis Index).
I have been reading up on this but cannot find a methodology that is adequate and of which I am certain of the correctness of the analysis.
So basically what I would like to achieve is to aggregate similar constructs, e.g., assess whether column 5 has value 1 (i.e., "yes") almost always when column 11 has value 1 and then aggregate.
Here the code that I try to used
library(psych)
tet <- tetrachoric(outcome)
corrplot(tet$rho, "ellipse", tl.cex = 0.75, tl.col = "black")
par(mfrow = c(1,2))
corr_matrix %>%
ggcorrplot(show.diag = F,
type="lower",
lab=TRUE,
lab_size=2)
KMO(corr_matrix)
cortest.bartlett(corr_matrix)
fa.parallel(corr_matrix, fm = "ml")
factor <- fa(corr_matrix, nfactors = 3, rotate = "oblimin", fm = "ml")
print(factor, cut = 0.3, digits = 3)
# -------- Pearson --------
cor(outcome, method = 'pearson', use = "pairwise.complete.obs") %>%
ggcorrplot(show.diag = F,
type="lower",
lab=TRUE,
lab_size=2)
KMO(outcome)
cortest.bartlett(outcome)
fa.parallel(outcome)
factor1 <- fa(outcome, nfactors = 3, rotate = "oblimin", cor = "tet", fm = "ml")
print(factor1, cut = 0.3, digits = 3)

Create bar plot for every level of a factor in a wide format data frame

I'm trying to create a bar plot using ggplot2 and my data is in this format:
dput here:
structure(list(clade = structure(c(1L, 3L, 2L, 3L, 2L, 2L), .Label = c("19A",
"20A", "20B", "20E (EU1)", "20I (Alpha, V1)", "20J (Gamma, V3)",
"21J (Delta)"), class = "factor"), C.T = c(0, 4, 4, 4, 4, 4),
A.G = c(0, 1, 1, 1, 1, 1), G.A = c(0, 2, 0, 2, 0, 0), G.C = c(0,
1, 0, 1, 0, 0), T.C = c(0, 0, 0, 0, 0, 0), C.A = c(0, 0,
0, 0, 0, 0), G.T = c(0, 0, 0, 0, 0, 0), A.T = c(0, 0, 0,
0, 0, 0), T.A = c(0, 0, 0, 0, 0, 0), T.G = c(0, 0, 0, 0,
0, 0), A.C = c(0, 0, 0, 0, 0, 0), C.G = c(0, 0, 0, 0, 0,
0), A.del = c(0, 0, 0, 0, 0, 0), TAT.del = c(0, 0, 0, 0,
0, 0), TCTGGTTTT.del = c(0, 0, 0, 0, 0, 0), TACATG.del = c(0,
0, 0, 0, 0, 0), AGTTCA.del = c(0, 0, 0, 0, 0, 0), GATTTC.del = c(0,
0, 0, 0, 0, 0)), row.names = c(NA, -6L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000014b25a51ef0>)
I'd like to create 7 bar plots (one for each "clade") where the X axis would have the columns of the data frame (C.T would be 1 bar, A.G would be another bar, etc) and the Y axis would be the count. Essentially, for each clade, print a barplot with the counts of column.
For example, for the bar plot of the clade "20B" and the bar name "C.T" the count would be the sum of the values from the data frame. Can I do that in this wide format? Do I need to transform the data to a long format instead?
I was trying to apply this SO answer: Plotting error bar on bar chart for a data frame in wide format using ggplot but I keep getting choose another strategy with names_repair
Thank you in advance, any help is very welcome!

Excluding one of the columns from the means calculating

I have a data.frame like this:
> dput(head(dat))
structure(list(`Gene name` = c("at1g01050", "at1g01080", "at1g01090",
"at1g01220", "at1g01320", "at1g01420"), `1_1` = c(0, 0, 0, 0,
0, 0), `1_2` = c(0, 0, 0, 0, 0, 0), `1_3` = c(0, 2.2266502274762,
0, 0, 0, 0), `1_4` = c(0, 1.42835007256373, 0, 0, 0, 0), `1_5` = c(0,
1, 0, 0, 0, 0.680307288653971), `1_6` = c(0, 0.974694551708235,
0.0703315834738149, 0, 0, 1.5411058346636), `1_7` = c(1, 1.06166030205396,
0, 0, 0, 0), `1_8` = c(1, 1.07309874414745, 0.129442847788922,
0, 0, 0), `1_9` = c(1.83566164452602, 0.770848509662441, 1.16522133036595,
1.02360016370994, 0, 0), `1_10` = c(0, 0, 0.96367393959757, 0,
0, 0), `1_11` = c(0, 1, 1.459452636222, 0, 0.992067202742928,
0), `1_12` = c(0, 0, 0.670100384155585, 0, 0.461601636474094,
0), `1_13` = c(0, 0, 1.43074917909221, 0, 1.35246977730244, 0
), `1_14` = c(0, 0, 1.13052717277684, 0, 1.27971261718285, 0),
`1_15` = c(0, 0, 0, 0, 0, 0), `1_16` = c(0, 0, 1.02186950513655,
0, 0.937805171752374, 0), `1_17` = c(0, 0, 0, 0, 1.82226410514639,
0), `1_18` = c(0, 0, 1.2057581396188, 0, 1, 0), `1_19` = c(0,
0, 2.54080080087007, 0, 1.74014162763125, 0), `1_20` = c(0,
0, 0, 0, 0, 0), `1_21` = c(0, 0, 1.85335086627868, 0, 2.93605031878879,
0), `1_22` = c(0, 0, 0, 0, 0, 0), `1_23` = c(0, 0, 0, 0,
0, 0), `1_24` = c(0, 0.59685787388353, 4.74450895485671,
0, 1.64665192735547, 0), `1_25` = c(0, 0, 0, 0, 0, 0), `1_26` = c(0,
0, 0, 0, 0, 0), `1_27` = c(0, 1.70324142554566, 0, 0, 0,
0), `1_28` = c(0, 4.02915818089525, 0, 0, 0, 0), `1_29` = c(0,
1.10050253348262, 0, 0, 0, 1.78705663080963), `1_30` = c(0,
0, 0, 0, 0, 0), `1_31` = c(0.525193634811661, 1.19203674964562,
0, 0, 0, 0), `1_32` = c(0.949695564218912, 0.511935958918944,
0.698256748091399, 0.924419021307232, 0, 0), `1_33` = c(1,
0.392202418854686, 0.981531026331928, 1, 0, 0), `1_34` = c(0,
0, 1.04480642952605, 0, 0, 0), `1_35` = c(0.875709646300199,
0.416787083481068, 0.910412293707794, 0, 0.931813162802324,
0), `1_36` = c(0.235817844851986, 0, 0.695496044366791, 0,
0, 0), `1_37` = c(0, 0, 0, 0, 0, 0), `1_38` = c(0, 0, 0,
0, 0, 0), `1_39` = c(0, 0, 0, 0, 0, 0), `1_40` = c(0, 0.426301584359177,
1.05916031917965, 0, 1.11716924423855, 0), `1_41` = c(0,
0, 0, 0, 0, 0), `1_42` = c(0, 0, 0, 0, 0, 0), `1_43` = c(0,
0, 0, 0, 0, 0), `1_44` = c(0, 0.817605484758179, 1, 0, 1,
0), `1_45` = c(0, 0, 0, 0, 1.83706702696725, 0), `1_46` = c(0,
0, 0, 0, 0, 0), `1_48` = c(0, 0, 0, 0, 0, 0), `1_49` = c(0,
0, 0, 0, 0, 0), `1_50` = c(0, 0, 0, 0, 0, 0), `1_51` = c(0,
0.822966241998042, 0, 0, 0, 0), `1_52` = c(0, 1.38548267401525,
0, 0, 0, 0), `1_53` = c(0, 0.693090058304095, 0, 0, 0, 1.200664746484
), `1_54` = c(0, 7.58136662752864, 0, 0, 0, 0), `1_55` = c(0.519878111919004,
0.530809413647805, 0.343274113384907, 0, 0, 0), `1_56` = c(1.24511715957891,
0.545097856366912, 0.397440073804376, 0, 0, 0), `1_57` = c(1.26748496499576,
0.502893153188496, 1, 1.09278985531586, 0, 0), `1_58` = c(0.696198684496234,
0.68197003689249, 1.30108437738319, 0.778091049180591, 0.533017938104689,
0), `1_59` = c(1.15255606344999, 0.294294436704185, 1.07862692616479,
1, 0.250091116406616, 0), `1_60` = c(1.95634163405497, 0,
1.1602014253913, 0, 0, 0), `1_61` = c(1.09287167009628, 0,
2.05939536537347, 1.08165521287259, 0.68027384701565, 0),
`1_62` = c(0.791776166968497, 0, 0.846107162142824, 0, 0.77013323652256,
0), `1_63` = c(0.378787010943447, 0.391876271945063, 0.623223753921758,
0, 0.651918444771296, 0), `1_64` = c(0.189585762007804, 0.361452381684218,
0.799519726870751, 0, 1.06818683719768, 0), `1_65` = c(0,
0, 2.5212953775211, 0, 0, 0), `1_66` = c(0, 0, 0, 0, 0, 0
), `1_67` = c(0, 0, 0, 0, 2.44827717262786, 0), `1_68` = c(0,
0, 0, 0, 0, 0), `1_69` = c(0, 0, 0, 0, 0, 0), `1_70` = c(0,
0, 2.36142611074334, 0, 2.391093649557, 0), `1_71` = c(0,
0, 0.35565044656798, 0, 0, 0), `1_72` = c(0, 0, 5.86951313801941,
0, 0, 0)), .Names = c("Gene name", "1_1", "1_2", "1_3", "1_4",
"1_5", "1_6", "1_7", "1_8", "1_9", "1_10", "1_11", "1_12", "1_13",
"1_14", "1_15", "1_16", "1_17", "1_18", "1_19", "1_20", "1_21",
"1_22", "1_23", "1_24", "1_25", "1_26", "1_27", "1_28", "1_29",
"1_30", "1_31", "1_32", "1_33", "1_34", "1_35", "1_36", "1_37",
"1_38", "1_39", "1_40", "1_41", "1_42", "1_43", "1_44", "1_45",
"1_46", "1_48", "1_49", "1_50", "1_51", "1_52", "1_53", "1_54",
"1_55", "1_56", "1_57", "1_58", "1_59", "1_60", "1_61", "1_62",
"1_63", "1_64", "1_65", "1_66", "1_67", "1_68", "1_69", "1_70",
"1_71", "1_72"), row.names = c(NA, 6L), class = "data.frame")
That's the code I use for calculation of the mean for 3 replicates which I have in the data frame:
## Calculating the mean of 3 "replicates"
ind <- c(1, 25, 49)
dat2 <- dat[-1]
tbl_end <- cbind(dat[1], sapply(0:23, function(i) rowMeans(dat2[ind+i])))
That's an error which comes:
Error in `[.data.frame`(dat2, ind + i) : undefined columns selected
Called from: eval(substitute(browser(skipCalls = pos), list(pos = 9 - frame)),
envir = sys.frame(frame))
I have 71 columns of results (should be 72 because I have 24 fractions and 3 replicates what gives 72 in total) but there should be one more column. No idea why it's missing but anyway I have to solve it. There is no 1_47 which should come with 1_23 and 1_71. Do you have any idea how can I edit my function to just ignore fraction 1_47 and still get a mean of 1_23 and 1_71 ?
Why not just add in a dummy column for 1_47. That will make your data more regular and make it much easier to extract the indexes you need. To do this, try
dat2<-cbind(dat[1:47], 1_47=rep(NA, nrow(dat)), dat[48:72])
ind <- c(1, 25, 49)
tbl_end <- cbind(dat[1], sapply(0:23, function(i) rowMeans(dat2[ind+i+1], na.rm=T)))

Finding the duplicates, average them and create a proper table

Let's start with my data:
> dput(head(tbl_end))
structure(list(`Gene name` = c("at1g01050.1", "at1g01080.1",
"at1g01090.1", "at1g01220.1", "at1g01320.2", "at1g01420.1"),
`1_1` = c(0, 0, 0, 0, 0, 0), `1_2` = c(0, 0, 0, 0, 0, 0),
`1_3` = c(0, 1, 0, 0, 0, 0), `1_4` = c(0, 0.660693687777888,
0, 0, 0, 0), `1_5` = c(0, 0.521435654491704, 0, 0, 0, 1),
`1_6` = c(0, 0.437291194705566, 0, 0, 0, 1), `1_7` = c(0,
0.52204783488213, 0, 0, 0, 0), `1_8` = c(0, 0.524298383907171,
0, 0, 0, 0), `1_9` = c(1, 0.376865096972469, 0, 1, 0, 0),
`1_10` = c(0, 0, 0, 0, 0, 0), `1_11` = c(0, 0, 0, 0, 0, 0
), `1_12` = c(0, 0, 0, 0, 0, 0), `1_13` = c(0, 0, 0, 0, 0,
0), `1_14` = c(0, 0, 0, 0, 0, 0), `1_15` = c(0, 0, 0, 0,
0, 0), `1_16` = c(0, 0, 0, 0, 0, 0), `1_17` = c(0, 0, 0,
0, 0, 0), `1_18` = c(0, 0, 0.476101907006443, 0, 0, 0), `1_19` = c(0,
0, 1, 0, 0, 0), `1_20` = c(0, 0, 0, 0, 0, 0), `1_21` = c(0,
0, 0, 0, 1, 0), `1_22` = c(0, 0, 0, 0, 0, 0), `1_23` = c(0,
0, 0, 0, 0, 0), `1_24` = c(0, 0, 0, 0, 0, 0)), .Names = c("Gene name",
"1_1", "1_2", "1_3", "1_4", "1_5", "1_6", "1_7", "1_8", "1_9",
"1_10", "1_11", "1_12", "1_13", "1_14", "1_15", "1_16", "1_17",
"1_18", "1_19", "1_20", "1_21", "1_22", "1_23", "1_24"), row.names = c(NA,
6L), class = "data.frame")
so I have more than 2k rows. As a name of the row I set the gene name but there is a problem. Sometimes same gene has a different "models" (so they put the dot after name and the number 1 or 2) but still it's the same gene so I want to find all of those duplicates (same gene name) and average the values in different columns for this gene and just leave the 1 row with the averaged values.
Is it possible to do ?
Just showing some of the gene names I have:
> dput(vec_names)
c("at1g01050.1", "at1g01080.1", "at1g01090.1", "at1g01220.1",
"at1g01320.2", "at1g01420.1", "at1g01470.1", "at1g01800.1", "at1g01910.5",
"at1g01920.2", "at1g01960.1", "at1g01980.1", "at1g02020.2", "at1g02100.2",
"at1g02130.1", "at1g02140.1", "at1g02150.1", "at1g02305.1", "at1g02500.2",
"at1g02560.1", "at1g02780.1", "at1g02880.3", "at1g02920.1", "at1g02930.2",
"at1g03030.1", "at1g03090.2", "at1g03110.1", "at1g03130.1", "at1g03210.1",
"at1g03220.1", "at1g03230.1", "at1g03310.2", "at1g03330.1", "at1g03475.1",
"at1g03630.2", "at1g03680.1", "at1g03870.1", "at1g03900.1", "at1g04080.2",
"at1g04130.1", "at1g04170.1", "at1g04190.1", "at1g04270.2", "at1g04350.1",
"at1g04410.1", "at1g04420.1", "at1g04530.1", "at1g04640.2", "at1g04690.1",
"at1g04750.2", "at1g04810.1", "at1g04850.1", "at1g04870.2", "at1g05010.1",
"at1g05180.1", "at1g05190.1", "at1g05320.3", "at1g05350.1", "at1g05520.1",
"at1g05560.1", "at1g05620.2", "at1g06000.1", "at1g06110.1", "at1g06130.2",
"at1g06290.1", "at1g06410.1", "at1g06550.1", "at1g06560.1", "at1g06570.1",
I think there is a function for that but can't find it.
Using data.table
library(data.table)
dt <- data.table(dat)
dt[, gene_unique := gsub("[.]*", "", dt$Gene)]
cols <- colnames(dt)[2:25]
dt[, lapply(.SD, mean), by = gene_unique, .SDcols = cols]
Using aggregate as suggested in comments
dat$`Gene name` = gsub("[.]*", "", dat$Gene)
aggregate(. ~ `Gene name`, dat, mean)

Plotting all of the rows in different graph - data frame

Propably the code is very simple but I have never tried plotting in R yet.
I would like to have a linear plot for every row and all the plots on different graph.
The number in my data goes from 0 to 1. Value one is the maximum of the plot, in some cases there might be few maximums in a single row. I would like to have a pdf file as an output.
Data:
> dput(head(tbl_end))
structure(list(`NA` = structure(1:6, .Label = c("AT1G01050",
"AT1G01080", "AT1G01090", "AT1G01220", "AT1G01320", "AT1G01420",
"ATCG00800", "ATCG00810", "ATCG00820", "ATCG01090", "ATCG01110",
"ATCG01120", "ATCG01240", "ATCG01300", "ATCG01310", "ATMG01190"
), class = "factor"), `10` = c(0, 0, 0, 0, 0, 0), `20` = c(0,
0, 0, 0, 0, 0), `52.5` = c(0, 1, 0, 0, 0, 0), `81` = c(0, 0.660693687777888,
0, 0, 0, 0), `110` = c(0, 0.521435654491704, 0, 0, 0, 1), `140.5` = c(0,
0.437291194705566, 0, 0, 0, 1), `189` = c(0, 0.52204783488213,
0, 0, 0, 0), `222.5` = c(0, 0.524298383907171, 0, 0, 0, 0), `278` = c(1,
0.376865096972469, 0, 1, 0, 0), `340` = c(0, 0, 0, 0, 0, 0),
`397` = c(0, 0, 0, 0, 0, 0), `453.5` = c(0, 0, 0, 0, 0, 0
), `529` = c(0, 0, 0, 0, 0, 0), `580` = c(0, 0, 0, 0, 0,
0), `630.5` = c(0, 0, 0, 0, 0, 0), `683.5` = c(0, 0, 0, 0,
0, 0), `735.5` = c(0, 0, 0, 0, 0, 0), `784` = c(0, 0, 0.476101907006443,
0, 0, 0), `832` = c(0, 0, 1, 0, 0, 0), `882.5` = c(0, 0,
0, 0, 0, 0), `926.5` = c(0, 0, 0, 0, 1, 0), `973` = c(0,
0, 0, 0, 0, 0), `1108` = c(0, 0, 0, 0, 0, 0), `1200` = c(0,
0, 0, 0, 0, 0)), .Names = c(NA, "10", "20", "52.5", "81",
"110", "140.5", "189", "222.5", "278", "340", "397", "453.5",
"529", "580", "630.5", "683.5", "735.5", "784", "832", "882.5",
"926.5", "973", "1108", "1200"), row.names = c(NA, 6L), class = "data.frame").
Would be great to have a name of the row on the top of each page in pdf.
Here's an example using your dputed data:
# open the pdf file
pdf(file='myfile.pdf')
# since I don't know what values should be on the X axis,
# I'm just using values from 1 to number of y-values
x <- 1:(ncol(tbl_end)-1)
for(i in 1:nrow(tbl_end)){
# plot onto a new pdf page
plot(x=x,y=tbl_end[i,-1],type='b',main=tbl_end[i,1],xlab='X',ylab='Y')
}
# close the pdf file
dev.off()
where the first page is something like this:
If you want to change the style (e.g. lines without the little circles etc.) of the plot, have a look at the documentation.

Resources