Combining dataframes with missing values - r
I have several dataframes with data from the same survey. I want to combine them for analysis. The dataframes contain both unique variables and two variables (ID and Contest_no) that are shared across all the dataframes; the two shared variables contain information about the respondent and the contest number (1,2,3, as respondents were asked the same questions three times).
The difficulty is that the dataframes have missing values:
DF1 <- data.frame(V1 = factor(c("A", "B", "C", "D")),
V2 = factor(c("A", "B", "C", "D")),
ID = factor(c("x1", "x1", "y2", "y2")),
Contest_no = factor(c("1", "2", "1", "2")))
DF2 <- data.frame(V3 = factor(c("A", "C", "D")),
V4 = factor(c("A", "C", "D")),
ID = factor(c("x1", "y2", "y2")),
Contest_no = factor(c("1", "1", "2")))
DF3 <- data.frame(V5 = factor(c("A", "B", "C")),
V6 = factor(c("A", "B", "C")),
ID = factor(c("x1", "x1", "y2")),
Contest_no = factor(c("1", "2", "1")))
As a result, respondent IDs and contest numbers aren't aligned. I want to match the data on respondent IDs and contest numbers so that the merged dataframe looks like this:
DF_merged <- data.frame(V1 = factor(c("A", "B", "C", "D")),
V2 = factor(c("A", "B", "C", "D")),
V3 = factor(c("A", NA, "C", "D")),
V4 = factor(c("A", NA, "C", "D")),
V5 = factor(c("A", "B", "C", NA)),
V6 = factor(c("A", "B", "C", NA)),
ID = factor(c("x1", "x1", "y2", "y2")),
Contest_no = factor(c("1", "2", "1", "2")))
I thought that full_join would do the trick, but DF_merged <- full_join(DF1, DF2, DF3, by="ID") gives me nonsensical results.
How can disparate data like this be combined?
New, updated example (to address the problem of multiplied rows). In this example there are no missing values at all, and both dataframes have the same number of rows, but the code results in multiplied rows. First, the two dataframes to be merged:
df1:
structure(list(ID = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L), .Label = c("EE1", "EE101", "EE102"), class = "factor"),
Contest_no = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 2L, 2L, 3L,
3L), Option = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"),
Chosen_option = c(0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 1L), Combination = structure(c(5L, 5L, 6L, 6L, 4L, 4L,
2L, 2L, 1L, 1L, 3L, 3L), .Label = c("V133", "V181", "V234",
"V252", "V32", "V67"), class = "factor"), Attribute1 = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L), .Label = c("has strong ties to the government",
"has weak ties to the government"), class = "factor"), Attribute2 = structure(c(1L,
2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community",
"has weak ties to the local pastoralist community"), class = "factor"),
Attribute3 = structure(c(2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L,
2L, 1L, 1L, 2L), .Label = c("is poor", "is wealthy"), class = "factor"),
Attribute4 = structure(c(2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 2L), .Label = c("has attained a high level of formal education (for example university degree)",
"has not attained a high level of formal education (for example never went to school or only attended primary school)"
), class = "factor")), .Names = c("ID", "Contest_no", "Option",
"Chosen_option", "Combination", "Attribute1", "Attribute2", "Attribute3",
"Attribute4"), class = "data.frame", row.names = c(NA, -12L))
df2:
structure(list(ID = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L), .Label = c("EE1", "EE101", "EE102"), class = "factor"),
Contest_no = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 2L, 2L, 3L,
3L), Option = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"),
Chosen_option = c(1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L,
0L, 1L), Combination = structure(c(6L, 6L, 4L, 4L, 1L, 1L,
3L, 3L, 5L, 5L, 2L, 2L), .Label = c("V150", "V249", "V252",
"V29", "V56", "V77"), class = "factor"), Attribute1 = structure(c(2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("has strong ties to the government",
"has weak ties to the government"), class = "factor"), Attribute2 = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community",
"has weak ties to the local pastoralist community"), class = "factor"),
Attribute3 = structure(c(2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L,
2L, 1L, 1L, 2L), .Label = c("is poor", "is wealthy"), class = "factor"),
Attribute4 = structure(c(2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L,
1L, 1L, 2L, 2L), .Label = c("has attained a high level of formal education (for example university degree)",
"has not attained a high level of formal education (for example never went to school or only attended primary school)"
), class = "factor")), .Names = c("ID", "Contest_no", "Option",
"Chosen_option", "Combination", "Attribute1", "Attribute2", "Attribute3",
"Attribute4"), class = "data.frame", row.names = c(NA, -12L))
and now the unsuccessful attempt to combine the two dataframes:
df_merge_attempt <- dplyr::full_join(df1, df2, by=c("ID","Contest_no"))
results in:
structure(list(ID = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L
), .Label = c("EE1", "EE101", "EE102"), class = "factor"), Contest_no = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L), Option.x = structure(c(1L, 1L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 1L, 1L, 2L, 2L), .Label = c("Option1", "Option2"), class = "factor"),
Chosen_option.x = c(0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L,
1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L),
Combination.x = structure(c(5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L,
4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 3L,
3L), .Label = c("V133", "V181", "V234", "V252", "V32", "V67"
), class = "factor"), Attribute1.x = structure(c(1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("has strong ties to the government",
"has weak ties to the government"), class = "factor"), Attribute2.x = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L), .Label = c("has strong ties to the local pastoralist community",
"has weak ties to the local pastoralist community"), class = "factor"),
Attribute3.x = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L,
2L), .Label = c("is poor", "is wealthy"), class = "factor"),
Attribute4.x = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("has attained a high level of formal education (for example university degree)",
"has not attained a high level of formal education (for example never went to school or only attended primary school)"
), class = "factor"), Option.y = structure(c(1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"),
Chosen_option.y = c(1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L,
1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L),
Combination.y = structure(c(6L, 6L, 6L, 6L, 4L, 4L, 4L, 4L,
1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 2L, 2L, 2L,
2L), .Label = c("V150", "V249", "V252", "V29", "V56", "V77"
), class = "factor"), Attribute1.y = structure(c(2L, 2L,
2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 2L, 1L), .Label = c("has strong ties to the government",
"has weak ties to the government"), class = "factor"), Attribute2.y = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community",
"has weak ties to the local pastoralist community"), class = "factor"),
Attribute3.y = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
2L), .Label = c("is poor", "is wealthy"), class = "factor"),
Attribute4.y = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("has attained a high level of formal education (for example university degree)",
"has not attained a high level of formal education (for example never went to school or only attended primary school)"
), class = "factor")), class = "data.frame", row.names = c(NA,
-24L), .Names = c("ID", "Contest_no", "Option.x", "Chosen_option.x",
"Combination.x", "Attribute1.x", "Attribute2.x", "Attribute3.x",
"Attribute4.x", "Option.y", "Chosen_option.y", "Combination.y",
"Attribute1.y", "Attribute2.y", "Attribute3.y", "Attribute4.y"
))
You can use dplyr::full_join with the argument by=c("ID","Contest_no"), chaining a second join to bring in the third data frame:
library(dplyr)
df1 <- full_join(DF1, DF2, by=c("ID","Contest_no")) %>%
full_join(DF3, by=c("ID","Contest_no"))
df1
# V1 V2 V3 V4 V5 V6 ID Contest_no
#1 A A A A A A x1 1
#2 B B <NA> <NA> B B x1 2
#3 C C C C C C y2 1
#4 D D D D <NA> <NA> y2 2
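Updated: the answer has been modified to also join on the Option column, because in the updated example each ID/Contest_no pair has two rows (one per option), so joining on only those two keys multiplies the rows:
df_merged <- full_join(df1, df2, by=c("ID","Contest_no", "Option"))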
Note: I had to tweak my dplyr to match what is suggested by @Gregor in order to get the expected result.
Related
Error in Anova.III.lm(mod, error, singular.ok = singular.ok, ...) : there are aliased coefficients in the model
For my experiment, I have 3 independent variables: trial type, sex and gaming experience (all of which are categorical). I have one dependent variable: proportion of correct trials (which is continuous). When I tried running a 3-way ANOVA, the assumptions were not met, and so I used an aligned-rank transformation ANOVA. m1 <- art(Proportioncorrect ~ Videogamefrequency + Biologicalsex + + Trialtype + Videogamefrequency:Biologicalsex + Videogamefrequency:Trialtype + Biologicalsex:Trialtype + Biologicalsex:Trialtype:Videogamefrequency, data = Gaming) The model gave me the error: Error in Anova.III.lm(mod, error, singular.ok = singular.ok, ...) : there are aliased coefficients in the model Could anyone give me a helping hand? My data is here: structure(list(ID = c("P_200214123342", "P_200224092247", "P_200219163622", "P_200220130332", "P_200219091823", "P_200225184226", "P_200219123120", "P_200219175102", "P_200214103155", "P_200219111605", "P_200217101213", "P_200219102411", "P_200221101028", "P_200220145557", "P_200225171612", "P_200224092247", "P_200219163622", "P_200220130332", "P_200214123342", "P_200219091823", "P_200225184226", "P_200219123120", "P_200219175102", "P_200214103155", "P_200219111605", "P_200217101213", "P_200219102411", "P_200221101028", "P_200220145557", "P_200225171612"), Trialtype = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Beaconed", "Probe"), class = "factor"), Proportioncorrect = c(0.729727660699102, 1.33933990048532, 0.729727660699102, 1.075862200454, 0.578378233982015, 1.16808048521424, 1.33933990048532, 1.13531397797248, 1.28700221758657, 1.13531397797248, 1.28700221758657, 1.13531397797248, 1.28700221758657, 1.28700221758657, 1.20358829695229, 0.297711691252463, 0.160690652951911, 0.147197653346961, 0.0667161517509908, 0.080085580033659, 0.160690652951911, 0.133731586046578, 0.214985569478799, 0.160690652951911, 0.269932799291976, 
0.339836905918588, 0.242365851038963, 0.214985569478799, 0.677268408841807, 1.20358829695229), Videogamefrequency = structure(c(2L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L), .Label = c("Monthly", "Never", "Weekly", "Yearly"), class = "factor"), Biologicalsex = structure(c(1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L), .Label = c("Female", "Male"), class = "factor")), row.names = c(NA, -30L), class = "data.frame")
Transform a data frame into a table with option
I have a data frame with different variables (columns). I want to transform this data frame into a table with a different structure to make it more readable. For example, I have a data frame like this: myData = structure(list(X = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "20", class = "factor"), Y = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("20", "100"), class = "factor"), MethodType = structure(c(2L, 2L, 4L, 4L, 1L, 1L, 3L, 3L, 2L, 2L, 4L, 4L, 1L, 1L, 3L, 3L), .Label = c("E", "Q", "R", "W"), class = "factor"), MethodType2 = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("A", "B"), class = "factor"), Metric1 = c(0.970017512487058, 0.969647220975651, 0.965873991040769, 0.966242788535318, 0.986725852301671, 0.98696657967457, 0.98252107117733, 0.982655296614757, 0.278826941542694, -0.990926101696033, 0.194574672498287, 0.281916524368647, 0.152983364411985, 1.44135982835554, 0.330270447575806, -0.369627160641594 ), Metric2 = c(0.987541353383459, 0.987007518796992, 0.980984962406015, 0.981646616541353, 0.984082706766917, 0.984481203007519, 0.988165413533835, 0.988375939849624, -0.109331599015822, -0.148471161609603, 1.31331396089969, -1.34238564643737, 2.14014350779371, -0.422879539464588, -1.25706359685425, 1.09603324772565)), row.names = c(NA, -16L), class = "data.frame") and I want to have a table like this: Which kind of manipulation I can use? Which tool I can use. I'm looking for something flexible that can work also with more factors.
Calculate the most consumed food items by Ill Patients
I have a dataset as below, which has many columns. There are some columns whose headings are : baked_hamburgur,spinach,mashed_potato,cabbages,jello,rolls,brown,milk,coffee,water,cakes,vanilla,chocolate,fruitsalad There are other columns as well, but I am only interested as of now in the above columns. the value in each row of these columns is either: yes, or no. A screenshot of this data is as under, as I am not able to attach/share this file in the question itself. The dput(head()) output is as under: > dput(head(illness_data)) structure(list(Age = structure(c(18L, 26L, 22L, 25L, 29L, 13L ), .Label = c("10", "106", "11", "12", "14", "15", "16", "17", "18", "19", "2", "20", "22", "23", "24", "25", "26", "27", "28", "30", "31", "32", "33", "34", "36", "38", "39", "4", "42", "43", "44", "45", "46", "48", "5", "7", "8", "9", "seven"), class = "factor"), sex = structure(c(3L, 2L, 3L, 3L, 2L, 3L), .Label = c("-1", "Female", "Male"), class = "factor"), timesupper = c(2000L, 1830L, 1830L, 1930L, 1930L, 1930L), ill = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "yes", class = "factor"), onsetdate = structure(c(4L, 4L, 4L, 1L, 1L, 4L), .Label = c("18-Apr", "18-Jun", "18/4", "19-Apr"), class = "factor"), onsettime = c(30L, 30L, 30L, 2230L, 2230L, 200L), baked_hamburgur = structure(c(2L, 2L, 2L, 2L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), spinach = structure(c(2L, 2L, 2L, 2L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), mashed_potato = structure(c(2L, 2L, 1L, 1L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), cabbages = structure(c(1L, 2L, 1L, 2L, 1L, 1L), .Label = c("no", "yes"), class = "factor"), jello = structure(c(1L, 1L, 1L, 2L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), rolls = structure(c(2L, 1L, 1L, 1L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), brown = structure(c(1L, 1L, 1L, 1L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), milk = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("no", "yes"), class = "factor"), 
coffee = structure(c(2L, 2L, 2L, 1L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), water = structure(c(1L, 1L, 1L, 2L, 2L, 1L), .Label = c("no", "yes"), class = "factor"), cakes = structure(c(1L, 1L, 2L, 1L, 1L, 1L), .Label = c("no", "yes"), class = "factor"), vanilla = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("no", "yes"), class = "factor"), chocolate = structure(c(1L, 2L, 2L, 1L, 1L, 2L), .Label = c("no", "yes"), class = "factor"), fruitsalad = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("no", "yes", "yes</pre></body></html>Ztext/plain\b\v\035(F]l~Ó_Ý\026R\002\001" ), class = "factor")), .Names = c("Age", "sex", "timesupper", "ill", "onsetdate", "onsettime", "baked_hamburgur", "spinach", "mashed_potato", "cabbages", "jello", "rolls", "brown", "milk", "coffee", "water", "cakes", "vanilla", "chocolate", "fruitsalad" ), row.names = c(NA, 6L), class = "data.frame") A complete dput command output is as under: > dput(illness_data) structure(list(Age = structure(c(18L, 26L, 22L, 25L, 29L, 13L, 36L, 8L, 11L, 7L, 24L, 10L, 8L, 35L, 34L, 6L, 22L, 39L, 12L, 9L, 36L, 17L, 9L, 20L, 37L, 27L, 32L, 30L, 21L, 24L, 3L, 18L, 33L, 16L, 5L, 31L, 28L, 14L, 19L, 38L, 2L, 4L, 23L, 1L, 18L, 15L), .Label = c("10", "106", "11", "12", "14", "15", "16", "17", "18", "19", "2", "20", "22", "23", "24", "25", "26", "27", "28", "30", "31", "32", "33", "34", "36", "38", "39", "4", "42", "43", "44", "45", "46", "48", "5", "7", "8", "9", "seven"), class = "factor"), sex = structure(c(3L, 2L, 3L, 3L, 2L, 3L, 3L, 3L, 2L, 3L, 3L, 2L, 3L, 3L, 2L, 2L, 3L, 3L, 3L, 2L, 2L, 1L, 3L, 3L, 3L, 2L, 2L, 3L, 2L, 3L, 3L, 3L, 2L, 3L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 2L, 3L), .Label = c("-1", "Female", "Male" ), class = "factor"), timesupper = c(2000L, 1830L, 1830L, 1930L, 1930L, 1930L, 2200L, 1900L, 1930L, NA, NA, NA, NA, 2200L, NA, NA, NA, 2200L, NA, NA, 2200L, 2200L, NA, NA, 2200L, NA, NA, NA, NA, NA, 1900L, NA, 1100L, NA, NA, NA, 2200L, 1930L, 1930L, 2200L, NA, NA, 1930L, 1930L, NA, NA), 
ill = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L ), .Label = "yes", class = "factor"), onsetdate = structure(c(4L, 4L, 4L, 1L, 1L, 4L, 4L, 2L, 4L, 4L, 4L, 1L, 1L, 4L, 1L, 3L, 1L, 4L, 1L, 1L, 4L, 4L, 1L, 1L, 4L, 1L, 4L, 4L, 1L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 1L, 4L, 4L, 4L, 4L, 1L, 4L, 1L ), .Label = c("18-Apr", "18-Jun", "18/4", "19-Apr"), class = "factor"), onsettime = c(30L, 30L, 30L, 2230L, 2230L, 200L, 100L, 2300L, 200L, 1030L, 30L, 2215L, 2200L, 100L, 2300L, 2145L, 2145L, 100L, 2300L, 2100L, 100L, 100L, 2115L, 2330L, 100L, 2130L, 230L, 200L, 2130L, 30L, 100L, 2230L, 1500L, 2400L, 2300L, 2230L, 100L, 230L, 2330L, 100L, 30L, 30L, 100L, 2400L, 215L, 2300L), baked_hamburgur = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L), .Label = c("no", "yes"), class = "factor"), spinach = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L), .Label = c("no", "yes"), class = "factor"), mashed_potato = structure(c(2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L ), .Label = c("no", "yes"), class = "factor"), cabbages = structure(c(1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L ), .Label = c("no", "yes"), class = "factor"), jello = structure(c(1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 
2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L ), .Label = c("no", "yes"), class = "factor"), rolls = structure(c(2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L ), .Label = c("no", "yes"), class = "factor"), brown = structure(c(1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L ), .Label = c("no", "yes"), class = "factor"), milk = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L ), .Label = c("no", "yes"), class = "factor"), coffee = structure(c(2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L ), .Label = c("no", "yes"), class = "factor"), water = structure(c(1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L ), .Label = c("no", "yes"), class = "factor"), cakes = structure(c(1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L ), .Label = c("no", "yes"), class = "factor"), vanilla = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L ), .Label = c("no", "yes"), class = "factor"), chocolate = structure(c(1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, NA, 1L, 1L, 
2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L ), .Label = c("no", "yes"), class = "factor"), fruitsalad = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L ), .Label = c("no", "yes", "yes</pre></body></html>Ztext/plain\b\v\035(F]l~Ó_Ý\026R\002\001" ), class = "factor")), .Names = c("Age", "sex", "timesupper", "ill", "onsetdate", "onsettime", "baked_hamburgur", "spinach", "mashed_potato", "cabbages", "jello", "rolls", "brown", "milk", "coffee", "water", "cakes", "vanilla", "chocolate", "fruitsalad" ), class = "data.frame", row.names = c(NA, -46L)) R has correctly read these columns as Factor type variable(s). Now, each of these columns correspond to what every ill patient in a hospital has consumed. I'd like to know the most consumed food item by ill patients, using R. Kindly advise a good way to do so. Thanks! Note, I have not tried any other options, than the ones mentioned in this URL below. However, I could not make it work. Count Factor Columns Using R
Since all the rows have ill = 'yes', we can count the number of "yes" values in each column. A base R approach could be: head(sort(colSums(illness_data[7:20] == "yes"), decreasing = TRUE), 5) # vanilla baked_hamburgur cakes spinach mashed_potato # 43 29 27 26 23 I have selected columns 7 to 20 because those are the only columns that contain food items. I have also shown only the top 5 values; you can show a different number by changing the 5 in the head() call. Note that a column containing NA (such as chocolate) gets an NA sum and is then dropped by sort(); pass na.rm = TRUE to colSums() if such columns should be counted.
I am not completely sure what you are looking for, but this will calculate how often foods are consumed (using the tidyverse package): library(tidyverse) illness_data_summed <- illness_data %>% mutate_at(vars(-Age, -sex, -timesupper,-onsetdate,-onsettime), ~ifelse(. == "yes", 1,0)) %>% summarise_at(vars(-Age, -sex, -timesupper,-onsetdate,-onsettime, -ill), ~sum(., na.rm = TRUE)) illness_data_summed[which(illness_data_summed == max(illness_data_summed))] So first I convert the yes into 1 and no into 0, which makes the sum a representation of the number of times the specific food was eaten. I do it for all columns except those you are not interested in (indicated by the - in vars) but you can also reverse that if that is desirable (e.g., when the number of vars you want to convert is lower than those you do not want to convert). The last part will result in: vanilla 1 43
Getting percentages out of a list of dataframes in R
I am very new to R (a few months experience from online learning and reading) and have no coding experience before this. I have been using a data set obtained from work (healthcare) for some practice. I wanted to demonstrate certain patient outcomes over time (by month) in this data set. I've separated the data by month into a separate data frames that I have stored in a list. I then narrowed down each data frame within the list to the 3 post-operative outcomes that I want to look at. All three outcomes are binary (Y or N). I would like to know if there is anyway I can work out the percentages of "Y" for each of these outcomes by month, and then store this in an object that I can then plot to show the trend over time (by month). Have I approached this problem completely wrongly? Should I not have used a list at all? I managed to get to a point where I have a list of tables of Y's and N's but am now completely clueless as to what to do from there. list(structure(list(Mobilised_D1 = structure(c(2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L), .Label = c("N", "Y"), class = "factor"), Catheter_rm_D1 = structure(c(2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L), .Label = c("N", "Y"), class = "factor"), Diet_D1 = structure(c(2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("N", "Y"), class = "factor")), class = "data.frame", row.names = 2:15), structure(list(Mobilised_D1 = structure(c(1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("N", "Y"), class = "factor"), Catheter_rm_D1 = structure(c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L ), .Label = c("N", "Y"), class = "factor"), Diet_D1 = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L ), .Label = c("N", "Y"), class = "factor")), class = "data.frame", row.names = 16:31), structure(list(Mobilised_D1 = structure(c(2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("N", "Y"), class = "factor"), Catheter_rm_D1 = 
structure(c(1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L), .Label = c("N", "Y"), class = "factor"), Diet_D1 = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("N", "Y"), class = "factor")), class = "data.frame", row.names = 32:42), structure(list(Mobilised_D1 = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("N", "Y"), class = "factor"), Catheter_rm_D1 = structure(c(2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("N", "Y"), class = "factor"), Diet_D1 = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("N", "Y"), class = "factor")), class = "data.frame", row.names = 43:60), structure(list(Mobilised_D1 = structure(c(1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, NA, 2L, 1L, 1L, 2L, NA), .Label = c("N", "Y"), class = "factor"), Catheter_rm_D1 = structure(c(1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("N", "Y"), class = "factor"), Diet_D1 = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("N", "Y"), class = "factor")), class = "data.frame", row.names = 61:74), structure(list(Mobilised_D1 = structure(c(1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L), .Label = c("N", "Y"), class = "factor"), Catheter_rm_D1 = structure(c(1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L ), .Label = c("N", "Y"), class = "factor"), Diet_D1 = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L ), .Label = c("N", "Y"), class = "factor")), class = "data.frame", row.names = 75:90))
For each component of the input list, L, take the indicated mean, arranging the results into a multivariate time series with one row per month. Then plot it on a single panel. Remove facet=NULL if you want each series in a separate panel. library(zoo) library(ggplot2) series <- zoo( t(sapply(L, function(x) colMeans(x == "Y"))) ) autoplot(series, facet = NULL) + geom_point() (continued after graph) Alternative An alternative is to create a data frame DF from L along with a month vector aggregating by month as shown. This makes use of the fact that DF will have row names consisting of the month followed by a decimal point and a row number from the original component that each input row was constructed from. DF <- do.call("rbind", setNames(L, seq_along(L))) month <- as.integer(rownames(DF)) series <- aggregate(zoo(DF == "Y"), month, mean) autoplot(series, facet = NULL) + geom_point()
How make labels of plot clear
I did a plot explaining occurrences of each modality for many variables. It is about clustering problem to show which variables are explaining each cluster. So > dput(DATA1[1:20,]) structure(list(TYPE_PEAU = structure(c(1L, 2L, 1L, 3L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L), .Label = c("Sèche", "Mixte", "Normale", "Grasse"), class = "factor"), SENSIBILITE = structure(c(2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 1L, 2L, 2L, 3L, 3L, 3L, 1L, 3L, 1L), .Label = c("Aucune", "Fréquente", "Occasionnelle" ), class = "factor"), IMPERFECTIONS = structure(c(2L, 2L, 3L, 2L, 3L, 1L, 2L, 2L, 1L, 2L, 2L, 3L, 2L, 1L, 2L, 2L, 2L, 3L, 2L, 3L), .Label = c("Fréquente", "Occasionnelle", "Rares"), class = "factor"), BRILLANCE = structure(c(3L, 3L, 1L, 1L, 1L, 2L, 1L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 3L), .Label = c("Aucune", "Partout", "Zone T"), class = "factor"), GRAIN_PEAU = structure(c(1L, 2L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 1L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 1L), .Label = c("Fin", "Moyen", "Dilaté"), class = "factor"), RIDES_VISAGE = structure(c(3L, 3L, 3L, 3L, 3L, 1L, 1L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 3L, 3L), .Label = c("Aucune", "Très visibles", "Visibles"), class = "factor"), ALLERGIES = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Non", "Oui"), class = "factor"), MAINS = structure(c(1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 3L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("Sèches", "Normales", "Moites"), class = "factor"), PEAU_CORPS = structure(c(3L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 3L, 1L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 3L, 3L, 1L), .Label = c("Normale", "Sèche", "Très sèche" ), class = "factor"), INTERET_ALIM_NATURELLE = structure(c(3L, 1L, 1L, 1L, 1L, 2L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Beaucoup", "Pas du tout", "Un peu" ), class = "factor"), INTERET_ORIGINE_GEO = structure(c(3L, 1L, 1L, 1L, 3L, 2L, 3L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 
1L, 1L, 3L, 1L, 1L, 1L), .Label = c("Beaucoup", "Pas du tout", "Un peu" ), class = "factor"), INTERET_VACANCES = structure(c(1L, 1L, 2L, 2L, 3L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L), .Label = c("À la mer", "À la montagne", "En ville"), class = "factor"), INTERET_ENVIRONNEMENT = structure(c(3L, 1L, 1L, 1L, 3L, 2L, 3L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Beaucoup", "Pas du tout", "Un peu" ), class = "factor"), INTERET_COMPOSITION = structure(c(3L, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Beaucoup", "Pas du tout", "Un peu" ), class = "factor"), PRIORITE_1 = structure(c(1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 1L, 1L, 1L, 1L, 3L, 2L, 2L, 3L, 1L), .Label = c("éclatante", "hydratée", "lisse", "matifiée", "nourrie", "purifiée", "reposée"), class = "factor"), MILIEU_RESIDENCE = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L), .Label = c("nature", "urbain"), class = "factor")), .Names = c("TYPE_PEAU", "SENSIBILITE", "IMPERFECTIONS", "BRILLANCE", "GRAIN_PEAU", "RIDES_VISAGE", "ALLERGIES", "MAINS", "PEAU_CORPS", "INTERET_ALIM_NATURELLE", "INTERET_ORIGINE_GEO", "INTERET_VACANCES", "INTERET_ENVIRONNEMENT", "INTERET_COMPOSITION", "PRIORITE_1", "MILIEU_RESIDENCE"), row.names = c(1L, 2L, 11L, 13L, 15L, 16L, 17L, 20L, 23L, 32L, 33L, 34L, 37L, 38L, 39L, 40L, 42L, 43L, 45L, 48L), class = "data.frame") Then I use this code: library(tidyverse) DATA1 %>% gather(k, v) %>% ggplot(aes(v)) + geom_bar(fill = "orange", width = 0.7) + coord_flip() + facet_wrap(~k) Then I get as result this plot: But as you can see lebels in the vertical axis are not clear !! please how can I resolve this issue??
You can try to resize the text: DATA1 %>% gather(k, v) %>% ggplot(aes(v)) + geom_bar(fill = "orange", width = 0.7) + theme(axis.text.y = element_text(face="bold", color="black", size=4)) + coord_flip() + facet_wrap(~k) And/or abbreviate the labels with: + scale_x_discrete(labels = abbreviate)