Related
community!
I'm trying to run FAMD on a morphology-based dataset with 25 qualitative variables recording the presence and absence of fluorescence on a body part (binary) and six quantitative variables. Furthermore, I have a few supplementary variables such as sex, genus and depth.
First I ran the code for the FAMD on my data set after I had removed all missing values with na.omit():
res.famd1<-FAMD(fluo_famd1,sup.var=c(1,2,28,35),graph=FALSE, ncp=5)
and retrieved a bunch of results like eigenvalues, scree plot etc.
I then tried to plot my qualitative variables within the two dimensions like in this example:
[Example][1]
This is the code I used:
quali.var1 <- get_famd_var(res.famd1, "quali.var")
quali.var1
fviz_famd_var(res.famd1, "quali.var")
Instead of plotting the categories R is plotting decimal numbers I can't explain.
[Missing categories][2]
After this I tried running the FAMD on my data set with missing values using the code given in the package description:
# Impute the missing values with missMDA, then hand the imputed `tab.disj`
# table to FAMD() together with the original data.
# library() is used instead of require(): if missMDA is not installed the
# script stops here with a clear error instead of failing on the next line
# with "could not find function".
library(missMDA)
res.impute <- imputeFAMD(fluo_famd2, ncp = 3)
res.famd2 <- FAMD(
  fluo_famd2,
  tab.disj = res.impute$tab.disj,
  sup.var = c(1, 2, 28)
)
When trying to plot the categories now, they do appear in the plot but they are doubled and labelled with _0 and _1.
[doubled categories][3]
My questions are:
Can you identify an obvious mistake? Why would the categories be plotted twice in the graph? Does it have an impact on the overall analysis? Is FAMD suited for a data set like this?
[1]: https://i.stack.imgur.com/8UFlA.png
[2]: https://i.stack.imgur.com/qb3Cz.png
[3]: https://i.stack.imgur.com/O1Dff.png
Please find a subset of my data here:
structure(list(genus = structure(c(5L, 7L, 7L, 7L, 9L, 7L, 7L,
9L, 9L, 7L, 7L, 9L, 7L, 6L, 7L), .Label = c("Cryptochirus",
"Dacryomaia",
"Fizesereneia", "Fungicola", "Hapalocarcinus", "Hiroia",
"Lithoscaptus",
"Neotroglocarcinus", "Opecarcinus", "Pseudohapalocarcinus",
"Xynomaia"
), class = "factor"), sex = structure(c(1L, 1L, 1L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("f", "m"), class
=
"factor"),
frontal_dorsal = structure(c(1L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("0", "1"), class =
"factor"),
frontal_ventral = structure(c(1L, 2L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("0", "1"), class =
"factor"),
mesogastric = structure(c(1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 2L, 2L), .Label = c("0", "1"), class =
"factor"),
cardial = structure(c(1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 2L), .Label = c("0", "1"), class = "factor"),
branchial = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
1L, 1L, 2L, 1L, 2L, 2L), .Label = c("0", "1"), class = "factor"),
ps1 = structure(c(1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L,
1L, 2L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
ps2 = structure(c(1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L,
1L, 2L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
ps3 = structure(c(1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L,
1L, 2L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
ps4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
ps6 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
telson = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
eyes = structure(c(1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 2L, 1L, 1L, 2L), .Label = c("0", "1"), class = "factor"),
eyestalk = structure(c(1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L,
2L, 1L, 1L, 1L, 1L, 2L), .Label = c("0", "1"), class = "factor"),
antennules = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class =
"factor"),
anntenullar_peduncle = structure(c(1L, 1L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L), .Label = c("0", "1"), class
=
"factor"),
depth = c(NA, 10.3, 16, 16.1, 14.3, 12.8, 10.8, 12.6, 10.2,
11, 11.9, 13.1, 10.7, 10.1, 12.3), carapace_fluo = c(NA,
NA, 0.0999104660846311, 0.459446596994549, 0.639459602769835,
0.0157309627508303, NA, 0.792912115871697, 0.385646421420439,
0.0934932558564838, 0.118926192063408, 0.334765757290687,
NA, 0.712954991372207, 0.816431146170724), ap_fluo = c(NA,
0, 0.153709650160554, NA, 0.526410945516736, 0,
0.0572985597508758,
NA, 0.0105633802816901, 0.284174213022855, 0.305258467023173,
0.402286503491138, NA, 0, 0.0679211592610398), prod_fluo = c(NA,
0, 0, NA, 0.528576376861794, 0, 0, 0.15260360009031, 0,
0.0252962625341841,
0.241194486983155, 0.0717077570655442, NA, 0.479219143576826,
0), pol_fluo = c(NA, 0, 0, NA, 0, 0, 0, 0.118164567879938,
0, 0, 1, 0, NA, 0.299160251924423, 0), dac_fluo = c(NA, 0,
0, NA, 0, 0, 0, 0.102848534648042, 0, 0, 0.309536216779573,
0, NA, 0.0654761904761905, 0), sum_chel = c(NA, 0, 0, NA,
0.345118733509235, 0, 0, 0.14349725008088, 0, 0.0155266470835082,
0.347599820547331, 0.0451661774453177, NA, 0.32612422524067,
0)), row.names = c(NA, -15L), class = c("tbl_df", "tbl",
"data.frame"))
I am trying to create a new column based on whether a respondent is healthy or not.
Here is the type of data I have:
test <- structure(list(`cutree(hc_diana, k = 4)` = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), id = c("117dbbbf15", "117dbbbf15", "117dbbbf15", "117dbbbf15",
"117dbbbf15", "117dbbbf15", "117dbbbf15", "117dbbbf15", "117dbbbf15",
"3c8bfb6fc3", "3c8bfb6fc3", "3c8bfb6fc3", "3c8bfb6fc3", "3c8bfb6fc3",
"3c8bfb6fc3", "3c8bfb6fc3", "3c8bfb6fc3", "3c8bfb6fc3", "8a594e9340",
"8a594e9340"), covid_tested = c("positive", "positive", "positive",
"positive", "positive", "positive", "positive", "positive", "positive",
"positive", "positive", "positive", "positive", "positive", "positive",
"positive", "positive", "positive", "positive", "positive"),
age = c(51, 51, 51, 51, 51, 51, 51, 51, 51, 28, 28, 28, 28,
28, 28, 28, 28, 28, 28, 28), gender = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("Female", "Male", "Other"), class = "factor"),
number_morbidities = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1), chills = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("No", "Yes"), class = "factor"), cough = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
diarrhoea = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), fatigue = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
headache = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), loss_smell_taste = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
muscle_ache = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), nasal_congestion = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
nausea_vomiting = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), shortness_breath = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
sore_throat = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), sputum = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("No", "Yes"), class = "factor"), temperature = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
loss_appetite = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), chest_pain = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
itchy_eyes = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), joint_pain = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
comorbidities = c("asthma", "diabetes_type_one", "diabetes_type_two",
"obesity", "hypertension", "heart_disease", "lung_condition",
"liver_disease", "kidney_disease", "asthma", "diabetes_type_one",
"diabetes_type_two", "obesity", "hypertension", "heart_disease",
"lung_condition", "liver_disease", "kidney_disease", "asthma",
"diabetes_type_one"), bolean_yes_no = c("No", "No", "No",
"Yes", "No", "No", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No")), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
I have 20 rows with 3 unique ids in this sample.
I want to create a new column based on several conditions:
If a respondent has a comorbidity, select only the row for that comorbidity and put its name into the new column; all other rows for that respondent should be NA.
As you can see, the second id does not have any comorbidity at all. For that respondent I want a new "healthy" category on one row, and the rest of the rows pertaining to this patient should appear as NA. The same applies to the third respondent.
How do I do this with tidyverse?
A sample of how I want the new column to look like is here, check the last column that summarises the above points.
structure(list(id = c("117dbbbf15", "117dbbbf15", "117dbbbf15",
"117dbbbf15", "117dbbbf15", "117dbbbf15", "117dbbbf15", "117dbbbf15",
"117dbbbf15", "3c8bfb6fc3", "3c8bfb6fc3", "3c8bfb6fc3", "3c8bfb6fc3",
"3c8bfb6fc3", "3c8bfb6fc3", "3c8bfb6fc3", "3c8bfb6fc3", "3c8bfb6fc3",
"8a594e9340", "8a594e9340"), number_morbidities = c(1, 1, 1,
1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1), chills = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), cough = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), diarrhoea = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), fatigue = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), headache = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), loss_smell_taste = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"), muscle_ache = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), nasal_congestion = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), nausea_vomiting = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), shortness_breath = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), sore_throat = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), sputum = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), temperature = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), loss_appetite = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), chest_pain = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), itchy_eyes = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), joint_pain = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"), comorbidities = c("asthma",
"diabetes_type_one", "diabetes_type_two", "obesity", "hypertension",
"heart_disease", "lung_condition", "liver_disease", "kidney_disease",
"asthma", "diabetes_type_one", "diabetes_type_two", "obesity",
"hypertension", "heart_disease", "lung_condition", "liver_disease",
"kidney_disease", "asthma", "diabetes_type_one"), bolean_yes_no = c("No",
"No", "No", "Yes", "No", "No", "No", "No", "No", "No", "No",
"No", "No", "No", "No", "No", "No", "No", "No", "No"), morbiditiy_healthy = c(NA,
NA, NA, "obesity", NA, NA, NA, NA, NA, "healthy", NA, NA, NA,
NA, NA, NA, NA, NA, "healthy", NA)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
We group by 'id', create the 'morbidity_healthy' with case_when where we check for 'Yes' in 'bolean_yes_no' column, if it is TRUE, then get the corresponding 'comorbidities', and if there are not (!) any 'Yes' and the row_number is 1, then return the 'healthy' for that row
library(dplyr)

# For each respondent (`id`):
#   * a row flagged "Yes" in `bolean_yes_no` receives that row's comorbidity;
#   * if the respondent has no "Yes" row at all, only their first row is
#     labelled "healthy";
#   * every other row matches no case_when() branch and is left as NA.
test %>%
  group_by(id) %>%
  mutate(
    morbidity_healthy = case_when(
      bolean_yes_no == "Yes" ~ comorbidities,
      !any(bolean_yes_no == "Yes") & row_number() == 1 ~ "healthy"
    )
  ) %>%
  # Drop the grouping so that later verbs are not silently applied per id.
  ungroup()
I have a data frame with different variables (columns).
I want to transform this data frame into a table with a different structure to make it more readable.
For example, I have a data frame like this:
myData = structure(list(X = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "20", class = "factor"),
Y = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("20", "100"), class = "factor"),
MethodType = structure(c(2L, 2L, 4L, 4L, 1L, 1L, 3L, 3L,
2L, 2L, 4L, 4L, 1L, 1L, 3L, 3L), .Label = c("E", "Q", "R",
"W"), class = "factor"), MethodType2 = structure(c(1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("A",
"B"), class = "factor"), Metric1 = c(0.970017512487058, 0.969647220975651,
0.965873991040769, 0.966242788535318, 0.986725852301671,
0.98696657967457, 0.98252107117733, 0.982655296614757, 0.278826941542694,
-0.990926101696033, 0.194574672498287, 0.281916524368647,
0.152983364411985, 1.44135982835554, 0.330270447575806, -0.369627160641594
), Metric2 = c(0.987541353383459, 0.987007518796992, 0.980984962406015,
0.981646616541353, 0.984082706766917, 0.984481203007519,
0.988165413533835, 0.988375939849624, -0.109331599015822,
-0.148471161609603, 1.31331396089969, -1.34238564643737,
2.14014350779371, -0.422879539464588, -1.25706359685425,
1.09603324772565)), row.names = c(NA, -16L), class = "data.frame")
and I want to have a table like this:
Which kind of manipulation can I use, and which tool? I'm looking for something flexible that also works with more factors.
I have a dataset I downloaded from The Human Protein Atlas which has annotations for the subcellular localization of 12,004 proteins. I've subset this file to only include "Gene name" and then 4 columns for how reliable that location is (based on immunofluorescently stained cells). These are "Validated">"Supported">"Approved">"Uncertain".
I've come up with a scoring system that I would like to apply to an LC-MS spectral count dataset I have by 1) weighting the quality of the annotation and 2) penalizing the number of locations the protein is found in (image of proposed scoring system).
The TLDR is that I need to count how many terms there are in each column of the following data set and get a data frame of this information.
df <- read.csv("proteinAtlas.csv")
dput(df)
structure(list(Gene_symbol = structure(1:49, .Label = c("AAAS",
"AAMP", "AAR2", "AARD", "AARS", "AARS2", "AARSD1", "ABCA13",
"ABCB6", "ABCB7", "ABCB8", "ABCC1", "ABCC4", "ABCD3", "ABCE1",
"ABCF1", "ABCF2", "ABCF3", "ABHD10", "ABHD14B", "ABHD6", "ABI1",
"ABI2", "ABL2", "ACAA1", "ACAA2", "ACACA", "ACAD9", "ACADM",
"ACADS", "ACADVL", "ACAP1", "ACAP2", "ACAT1", "ACAT2", "ACBD3",
"ACBD5", "ACIN1", "ACLY", "ACO2", "ACOT1", "ACOT13", "ACOT2",
"ACOT7", "ACOT8", "ACOT9", "ACOX1", "ACP1", "ACP5"), class = "factor"),
Validated = structure(c(1L, 2L, 1L, 1L, 2L, 4L, 1L, 1L, 3L,
1L, 1L, 1L, 1L, 5L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
5L, 1L, 1L, 4L, 4L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 5L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 6L, 1L, 1L), .Label = c("", "Cytosol",
"Golgi apparatus", "Mitochondria", "Peroxisomes", "Vesicles"
), class = "factor"), Supported = structure(c(1L, 9L, 1L,
1L, 1L, 1L, 1L, 1L, 5L, 10L, 10L, 12L, 1L, 1L, 1L, 1L, 4L,
1L, 1L, 6L, 1L, 3L, 1L, 11L, 1L, 10L, 2L, 1L, 1L, 10L, 10L,
1L, 1L, 1L, 4L, 8L, 1L, 11L, 7L, 10L, 1L, 1L, 1L, 4L, 13L,
1L, 1L, 1L, 1L), .Label = c("", "Actin filaments;Cytosol",
"Cell Junctions;Plasma membrane", "Cytosol", "Cytosol;Mitochondria;Nucleoplasm;Plasma membrane",
"Cytosol;Nucleoli;Nucleus", "Cytosol;Nucleoplasm;Plasma membrane",
"Golgi apparatus", "Microtubules", "Mitochondria", "Nucleoplasm",
"Plasma membrane", "Vesicles"), class = "factor"), Approved = structure(c(3L,
1L, 5L, 12L, 1L, 1L, 6L, 4L, 1L, 1L, 17L, 1L, 8L, 1L, 1L,
1L, 1L, 7L, 13L, 1L, 16L, 1L, 15L, 1L, 1L, 1L, 14L, 1L, 1L,
15L, 17L, 18L, 11L, 1L, 17L, 1L, 1L, 1L, 1L, 1L, 13L, 2L,
13L, 15L, 13L, 9L, 17L, 10L, 5L), .Label = c("", "Cell Junctions",
"Centrosome;Cytosol;Nuclear membrane", "Centrosome;Cytosol;Vesicles",
"Cytosol", "Cytosol;Nuclear membrane", "Cytosol;Nucleoli",
"Cytosol;Nucleoli;Plasma membrane", "Cytosol;Nucleoplasm;Plasma membrane",
"Cytosol;Nucleus", "Endosomes", "Lipid droplets", "Mitochondria",
"Nucleoli fibrillar center", "Nucleoplasm", "Nucleoplasm;Vesicles",
"Nucleus", "Vesicles"), class = "factor"), Uncertain = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("", "Cytosol;Plasma membrane", "Nucleoli"
), class = "factor")), .Names = c("Gene_symbol", "Validated",
"Supported", "Approved", "Uncertain"), class = "data.frame", row.names = c(NA,
-49L))
So the ideal output would look like this figure or, if you prefer, dput():
structure(list(Gene_symbol = structure(1:29, .Label = c("AAAS",
"AAMP", "AAR2", "AARD", "AARS", "AARS2", "AARSD1", "ABCA13",
"ABCB6", "ABCB7", "ABCB8", "ABCC1", "ABCC4", "ABCD3", "ABCE1",
"ABCF1", "ABCF2", "ABCF3", "ABHD10", "ABHD14B", "ABHD6", "ABI1",
"ABI2", "ABL2", "ACAA1", "ACAA2", "ACACA", "ACAD9", "ACADM"), class = "factor"),
Validated = c(NA, 1L, NA, NA, 1L, 1L, NA, NA, 1L, NA, NA,
NA, NA, 1L, 1L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA,
NA, 1L, 1L), Supported = c(NA, 1L, NA, NA, NA, NA, NA, NA,
4L, 1L, 1L, 1L, NA, NA, NA, NA, 1L, NA, NA, 3L, NA, 2L, NA,
1L, NA, 1L, 2L, NA, NA), Approved = c(3L, NA, 1L, 1L, NA,
NA, 2L, 3L, NA, NA, 1L, NA, 3L, NA, NA, NA, NA, 2L, 1L, NA,
2L, NA, 1L, NA, NA, NA, 1L, NA, NA), Uncertain = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("Gene_symbol",
"Validated", "Supported", "Approved", "Uncertain"), class = "data.frame", row.names = c(NA,
-29L))
For the most part, each column holds a string of terms separated by ";". However, in some cases there are terms like "Nucleoli fibrillar center" or "Lipid droplets" which contain spaces and should be counted as one word/term.
I've found examples of counting the number of words in a string in R where:
# Count the comma-separated terms in a single character string.
d <- "foo,bar,fun"
# strsplit() returns a list with one element per input string; [[1]] extracts
# the vector of terms for `d` before taking its length.
length(strsplit(d, ",")[[1]])
class(d)
But this only works on the "character" class and not "data.frame".
Can anyone suggest how to do this in R?
Many thanks!
We can use str_count. Loop over the columns except the first one (lapply(df[-1], ...)), count the ";" separators and add 1 to get the number of terms, then replace the elements that were empty strings with NA.
library(stringr)

# Replace every annotation column (all but the first) with the number of
# ";"-separated terms in each cell, i.e. number of separators + 1.
# Cells that are empty strings carry no annotation and are set to NA.
# The counts are kept as integers; the mask is applied by explicit assignment
# rather than by multiplying with NA^(condition), which is hard to read and
# turns the result into doubles.
df[-1] <- lapply(df[-1], function(x) {
  x <- as.character(x) # factor columns -> their labels
  n_terms <- str_count(x, ";") + 1L
  n_terms[x %in% ""] <- NA_integer_ # %in% never yields NA, unlike ==
  n_terms
})
A solution using base:
# Base-R version: count the ";"-separated terms in every column but the first.
#
# Working column-wise with lapply() (instead of row-wise apply() followed by
# c(x[1], counts)) keeps the counts as integers rather than coercing them to
# character, and leaves the first column and all column names exactly as they
# are in `df`, so no names need to be hard-coded afterwards.
result_df <- df
result_df[-1] <- lapply(df[-1], function(x) {
  x <- as.character(x) # factor columns -> their labels
  n_terms <- lengths(strsplit(x, ";", fixed = TRUE))
  # strsplit("") yields a zero-length vector and strsplit(NA) yields NA
  # (length 1); neither is a real annotation, so report both as NA.
  n_terms[is.na(x) | n_terms == 0L] <- NA_integer_
  n_terms
})
I have a very large data frame (nrow=~273,000) which I've subset as an example below: Each row is a protein name(s) and has various numbers of columns that lists the subcellular structures in which they can be found in human cells. 1) I would like to remove duplicate entries for each row and am struggling with this (code below). 2) I would then like to be able to count how many columns (subcellular structures) each gene can be found in.
Background: I got this data from Uniprot and cleaned it up as best as I could using regex but there are still some cases where there are rows with duplicate entries (e.g. FMR1 lists Chromosome 2x, Cytoplasm 3x and Plasma Membrane 2x - furthermore there are some blank columns in between them)
dput(df1)
structure(list(FMR1 = structure(c(41L, 3L, 17L, 63L, 16L, 24L,
35L, 33L, 52L, 6L, 49L, 5L, 71L, 72L, 42L, 58L, 22L, 20L, 19L,
80L, 9L, 51L, 66L, 64L, 23L, 14L, 60L, 45L, 28L, 54L, 7L, 30L,
29L, 44L, 53L, 8L, 69L, 79L, 10L, 11L, 26L, 37L, 39L, 40L, 82L,
73L, 18L, 21L, 27L, 47L, 4L, 46L, 1L, 13L, 36L, 70L, 74L, 67L,
78L, 77L, 61L, 62L, 31L, 56L, 34L, 57L, 25L, 81L, 75L, 59L, 2L,
65L, 55L, 38L, 50L, 68L, 32L, 12L, 43L, 15L, 48L, 76L), .Label = c("AAMP",
"ADCY10 SAC", "AIMP1 EMAP2 SCYE1", "ANTXR2 CMG2", "APBB1 FE65 RIR",
"APC DP2", "APLP1", "ARHGAP26 GRAF KIAA0621 OPHN1L", "ARL4A ARL4",
"ATP6V0D1 ATP6D VPATPD", "ATP6V1D ATP6M VATD", "AZIN2 ADC KIAA1945 ODCP",
"CACNB2 CACNLB2 MYSB", "CAMK2D CAMKD", "CDCA8 PESCRG3", "CDK1 CDC2 CDC28A CDKN1 P34CDC2",
"CEMIP KIAA1199", "CIB1 CIB KIP PRKDCIP", "CLTA", "CLTB", "CMTM8 CKLFSF8",
"DMD", "DSP", "ECT2", "EHD2 PAST2", "ENTPD2 CD39L1", "ERBB2 HER2 MLN19 NEU NGL",
"EVPL", "FCHO1 KIAA0290", "FCHO2", "FGR SRC2", "GPER1 CEPR CMKRL2 DRY12 GPER GPR30",
"HDAC6 KIAA0901 JM21", "ITCH", "ITGB1BP1 ICAP1", "KCTD7", "KIFC3",
"MFN1", "MISP C19orf21", "MYOT TTID", "NGDN C14orf120", "NISCH IRAS KIAA0975",
"NR1D1 EAR1 HREV THRAL", "PGM5 PGMRP", "PKP4", "PLA2G6 PLPLA9",
"PNKD KIAA1184 MR1 TAHCCP2 FKSG19 UNQ2491/PRO5778", "POP7 RPP20",
"PPL KIAA0568", "PRDX3 AOP1", "PTOV1 ACID2 PP642 UNQ6127/PRO20092",
"PTPN23 KIAA1471", "PTPRE", "PTPRR ECPTP PTPRQ", "RAB13 GIG4",
"RAB23 HSPC137", "RAB29 RAB7L1", "RAB30", "RAB38", "RAB40AL RLGP",
"RAB8A MEL RAB8", "RAB9A RAB9", "RACGAP1 KIAA1478 MGCRACGAP",
"RAP1B OK/SW-cl", "RGS8", "RPSA LAMBR LAMR1", "SGIP1", "SHMT2",
"SHROOM3 KIAA1481 SHRML MSTP013", "SLC28A3 CNT3", "SNTA1 SNT1",
"SNTB1 SNT2B1", "SNX11", "SNX12", "STOM BND7 EPB72", "TEX10 L18 Nbla10363",
"TNFRSF8 CD30 D1S166E", "TNS4 CTEN PP14434", "TRIM72 MG53", "USP6 HRP1 TRE2",
"VCL", "YES1 YES"), class = "factor"), Nucleus = structure(c(3L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("Mitochondrion ", "Nucleus", "Nucleus ", "Plasma membrane",
"Plasma membrane "), class = "factor"), Chromosome = structure(c(1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L), .Label = c("Chromosome", "Cytoplasm", "Cytoplasm "), class = "factor"),
Chromosome.1 = structure(c(4L, 5L, 7L, 5L, 14L, 12L, 20L,
18L, 5L, 20L, 20L, 2L, 1L, 1L, 8L, 10L, 19L, 1L, 1L, 8L,
16L, 16L, 17L, 19L, 20L, 21L, 15L, 13L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 9L, 10L,
16L, 16L, 16L, 22L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 11L,
7L, 14L, 9L, 17L, 11L, 9L, 2L, 6L, 6L, 17L, 18L, 10L, 1L,
1L, 17L, 19L, 19L, 1L, 3L, 5L, 1L), .Label = c("", " ", "Chromosome",
"Cytoplasm ", "Cytoplasmic vesicle", "Cytoplasmic vesicle ",
"Endoplasmic reticulum", "Endosome", "Endosome ", "Golgi apparatus",
"Golgi apparatus ", "Midbody", "Midbody ", "Mitochondrion",
"Mitochondrion ", "Nucleus", "Nucleus ", "Perikaryon ",
"Plasma membrane", "Plasma membrane ", "Sarcoplasmic reticulum ",
"Secreted"), class = "factor"), Cytoplasm = structure(c(1L,
15L, 13L, 10L, 1L, 13L, 1L, 1L, 5L, 2L, 11L, 1L, 1L, 1L,
5L, 8L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 14L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 5L, 9L, 2L, 3L, 6L, 7L, 2L, 1L, 2L, 4L, 11L, 12L,
5L, 1L, 1L, 1L, 7L, 3L, 1L, 2L, 2L, 2L), .Label = c("", " ",
"Cytoplasmic vesicle", "Endoplasmic reticulum", "Endosome",
"Endosome ", "Golgi apparatus", "Golgi apparatus ", "Golgi appartus",
"Midbody", "Mitochondrion ", "Nucleus ", "Plasma membrane",
"Plasma membrane ", "Secreted "), class = "factor"), Cytoplasm.1 = structure(c(1L,
4L, 7L, 7L, 1L, 1L, 1L, 1L, 5L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
6L, 3L, 2L, 1L, 1L, 1L), .Label = c("", " ", "Endoplasmic reticulum",
"Endoplasmic reticulum ", "Endosome", "Mitochondrion", "Plasma membrane"
), class = "factor"), Cytoplasmic.vesicle = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L), .Label = c("", "Golgi apparatus"
), class = "factor"), Perikaryon = structure(c(2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L,
1L, 1L, 1L, 1L), .Label = c("", " ", "Golgi apparatus"), class = "factor"),
X = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L), .Label = c("",
"Cytoplasmic granule"), class = "factor"), X.1 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L), .Label = c("", "Perikaryon"), class = "factor"),
X.2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), X.3 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Plasma.membrane = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), Plasma.membrane.1 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
)), .Names = c("FMR1", "Nucleus", "Chromosome", "Chromosome.1",
"Cytoplasm", "Cytoplasm.1", "Cytoplasmic.vesicle", "Perikaryon",
"X", "X.1", "X.2", "X.3", "Plasma.membrane", "Plasma.membrane.1"
), class = "data.frame", row.names = c(NA, -82L))
I've tried getting only unique columns for each row with no luck, for example:
unique(df1) # Drops duplicated rows (rows compared as a whole), not repeated values within a row
dplyr::distinct(df1) # Likewise keeps only the distinct rows of the input table
I think the problem is that the above functions are looking for row names which are identical which is not what I want. I want distinct columns for each row. I was thinking of using the melt function but since there is an odd number of columns for each row this won't work.
I would expect the output to look like this newDF
structure(list(FMR1 = structure(c(7L, 1L, 3L, 9L, 2L, 4L, 6L,
5L, 8L), .Label = c("AIMP1 EMAP2 SCYE1", "CDK1 CDC2 CDC28A CDKN1 P34CDC2",
"CEMIP KIAA1199", "ECT2", "HDAC6 KIAA0901 JM21", "ITGB1BP1 ICAP1",
"NGDN C14orf120", "PTPN23 KIAA1471", "RACGAP1 KIAA1478 MGCRACGAP"
), class = "factor"), Nucleus = structure(c(2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("Nucleus", "Nucleus "), class = "factor"),
Chromosome = structure(c(1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("Chromosome", "Cytoplasm"), class = "factor"),
Cytoplasmic.vesicle = structure(c(1L, 8L, 2L, 4L, 5L, 4L,
7L, 6L, 3L), .Label = c("Cytoplasm ", "Endoplasmic reticulum",
"Endosome", "Midbody", "Mitochondrion", "Perikaryon ", "Plasma membrane ",
"Secreted "), class = "factor"), Perikaryon = structure(c(1L,
2L, 3L, 3L, 1L, 3L, 1L, 1L, 1L), .Label = c("", "Endoplasmic reticulum ",
"Plasma membrane"), class = "factor"), Plasma.membrane = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("FMR1", "Nucleus",
"Chromosome", "Cytoplasmic.vesicle", "Perikaryon", "Plasma.membrane"
), class = "data.frame", row.names = c(NA, -9L))
From here I would like to get a rowSums(df1), so I was thinking of coercing each term to a number (e.g. Cytoplasmic vesicle=1, Nucleus=1, Endoplasmic reticulum=1, etc.), but I run into a problem on this dummy dataset.
df2 <- as.numeric(newDF)
Error: (list) object cannot be coerced to type 'double'
df2 <- as.numeric(newDF[,2:n])
Error in 2:n : NA/NaN argument
Thank you for your help.
EDIT
I would like to get a count for each row of how many unique columns in the newDF like so:
FMR1 5
NGDN C14orf120 3
AIMP1 EMAP2 SCYE1 4
CEMIP KIAA1199 4
RACGAP1 KIAA1478 MGCRACGAP 4
CDK1 CDC2 CDC28A CDKN1 P34CDC2 3
ECT2 4
ITGB1BP1 ICAP1 3
HDAC6 KIAA0901 JM21 3
PTPN23 KIAA1471 3
This may be one way to go. Since your expected result is a character vector, I cannot visualize the final output. Yet, you said you want to check how many columns each protein appears in in the data. I hope the outcome I have is what you are after.
First, I converted all columns to character. Then, I converted the data to long format using gather(). Within each subcellular structure group (i.e., subcellular), I added row indices (e.g., 1 means the 1st row in your original data) and trimmed white space. Then I removed any rows with NA in protein, and any rows containing "" or " ". At that point the tidying up is done. For each row (i.e., row.index), I removed duplicated protein types.
Finally, I ungrouped the data and counted how many columns (i.e., subcellular structures) each protein appears in. Basically, by this point you are counting how many times each protein appears in the data set.
With your sample data, I got the following result. But I am not sure if this is what you want. (I am off to bed now. So I cannot help you for some hours. If anybody can jump in, please do so.)
# Stack every column of `mydf` into one long `protein` column, remember which
# original row each value came from, discard blanks and within-row repeats,
# and tally how often each remaining value occurs (most frequent first).
mydf %>%
  mutate_all(as.character) %>%
  gather(key = subcellular, value = protein) %>%
  group_by(subcellular) %>%
  # Position within each stacked column == row number in the original data.
  mutate(protein = trimws(protein), row.index = row_number()) %>%
  ungroup() %>%
  filter(!is.na(protein), !protein %in% c("", " ")) %>%
  # Keep the first occurrence of each value per original row.
  distinct(row.index, protein, .keep_all = TRUE) %>%
  count(protein, sort = TRUE)
# protein n
# <chr> <int>
# 1 Cytoplasm 82
# 2 Plasma membrane 70
# 3 Nucleus 25
# 4 Endosome 9
# 5 Mitochondrion 9
# 6 Cytoplasmic vesicle 8
# 7 Golgi apparatus 7
# 8 Endoplasmic reticulum 5
# 9 Midbody 3
#10 Perikaryon 3
# ... with 87 more rows
Given jjl's comment, I did the following. Instead of counting how many columns each protein appears in, I counted how many protein names exist for each row.
# Same tidying as before, but the final tally is per original row: how many
# distinct, non-blank values each row of `mydf` holds across all its columns.
mydf %>%
  mutate_all(as.character) %>%
  gather(key = subcellular, value = protein) %>%
  group_by(subcellular) %>%
  # Position within each stacked column == row number in the original data.
  mutate(protein = trimws(protein), row.index = row_number()) %>%
  ungroup() %>%
  filter(!is.na(protein), !protein %in% c("", " ")) %>%
  # Keep the first occurrence of each value per original row.
  distinct(row.index, protein, .keep_all = TRUE) %>%
  count(row.index)
# row.index n
# <int> <int>
# 1 1 4
# 2 2 6
# 3 3 5
# 4 4 6
# 5 5 4
# 6 6 5
# 7 7 4
# 8 8 4
# 9 9 5
#10 10 3
# ... with 72 more rows
EDIT
If you want to remove the 1st column (i.e, FMR1), you can do that by filtering that column. I added filter(subcellular != "FMR1") to my code before I used count() at the end.
# Per-row tally of distinct, non-blank values, excluding whatever came from
# the first column ("FMR1"), so that column does not add to the count.
mydf %>%
  mutate_all(as.character) %>%
  gather(key = subcellular, value = protein) %>%
  group_by(subcellular) %>%
  # Position within each stacked column == row number in the original data.
  mutate(protein = trimws(protein), row.index = row_number()) %>%
  ungroup() %>%
  filter(!is.na(protein), !protein %in% c("", " ")) %>%
  # Keep the first occurrence of each value per original row.
  distinct(row.index, protein, .keep_all = TRUE) %>%
  # Drop the values that originated in the FMR1 column before counting.
  filter(subcellular != "FMR1") %>%
  count(row.index)
# A tibble: 9 x 2
# row.index n
# <int> <int>
#1 1 3
#2 2 4
#3 3 4
#4 4 4
#5 5 3
#6 6 4
#7 7 3
#8 8 3
#9 9 3