Data set structure and NA values - r

I've successfully rearranged the dataset into the format I want (see code annotation). However:
(a) I feel that there's a cleaner, more efficient way to construct the database, maybe in the Tidyverse? My solution reads as a bit hacked together, and it takes a lot of code to implement. I'd really like to find an elegant and efficient way to do this, but need help.
(b) I am having trouble with the NA values. No matter what I do, R reads them as characters. I don't know if this is crucial for analysis as when I pass the as.numeric() function, it coerces these to NA values anyways. However, I'd like to understand what I'm doing wrong, and how to 'do it right' going forward.
I've provided the code I'm using below, annotated, and dput() 5 lines from my (very large) data set are at the end of the code block to help re-create. Any help/feedback would be much appreciated. Thank you.
library(tidyverse)
# Load the tab-separated data set. The first column supplies the row names and
# the first line the column headers. Both "NA" and " NA" (with a leading space)
# are read as missing values. The dput() output at the bottom of the code block
# shows a few rows of the result.
pheno_sep_imp <- read.table(
  file = "~/pheno_sep_imp.txt",
  row.names = 1,
  header = TRUE,
  na.strings = c("NA", " NA"),
  sep = "\t"
)
# read.table() already returns a data frame, so it is not wrapped again.
# Convert every integer and character column to a factor in a single pass.
pheno_sep_imp <- mutate(
  pheno_sep_imp,
  across(
    where(function(col) is.integer(col) || is.character(col)),
    as.factor
  )
)
# Remove anterior teeth: keep columns 1-6 plus column blocks 22-46 and 62-86.
pheno_sep_imp <- pheno_sep_imp[c(1:6, 22:46, 62:86)]
# Re-code "R3En" to "3" and "CON" to NA for analysis.
# The recode is applied to the factor levels rather than by assigning values:
# writing the number 3 into a factor that has no "3" level yields NA together
# with an "invalid factor level" warning. Renaming a level to an existing label
# merges the two levels; renaming a level to NA turns its values into real
# missing values (is.na() is TRUE for them at this point).
pheno_sep_imp[] <- lapply(pheno_sep_imp, function(col) {
  if (is.factor(col)) {
    lv <- levels(col)
    lv[lv == "R3En"] <- "3"
    lv[lv == "CON"] <- NA
    levels(col) <- lv
  } else if (is.character(col)) {
    # %in% never returns NA, so existing missing values are left untouched
    col[col %in% "R3En"] <- "3"
    col[col %in% "CON"] <- NA
  }
  col
})
# Set up a zero-row, eight-column data frame that will receive the
# transformed (one row per individual and tooth) data set.
dta <- data.frame(
  matrix(
    logical(0),
    nrow = 0,
    ncol = 8,
    dimnames = list(
      NULL,
      c("Ind", "Geo", "E1", "E2", "E3", "E4", "E5", "Tooth")
    )
  ),
  stringsAsFactors = FALSE
)
# Derive the tooth/element prefix (the text before the first "_") from the
# column names of the original data set. The first column is skipped, so
# tooth_names[j] belongs to column j + 1 of pheno_sep_imp.
nms <- names(pheno_sep_imp)
str_nms <- strsplit(nms, "_")
tooth_names <- vapply(
  str_nms[-1],
  function(parts) parts[1],
  character(1),
  USE.NAMES = FALSE
)
# Positions in pheno_sep_imp of the first element column (E1) of each tooth;
# the four columns that follow hold E2-E5 for the same tooth.
root_num <- seq(7, 52, by = 5)
count <- 0
# One output row per individual and tooth. seq_len() keeps the loop from
# running at all when pheno_sep_imp has no rows (1:0 would iterate twice).
for (i in seq_len(nrow(pheno_sep_imp))) {
  tmp <- pheno_sep_imp[i, ]
  ind <- paste0("ind_", i)
  for (k in root_num) {
    count <- count + 1
    # NOTE: toString() turns every value into text, so a missing value is
    # stored in dta as the literal string "NA", not as a real NA.
    dta[count, ] <- c(
      ind,
      toString(tmp[2][1, 1]),
      toString(tmp[k][[1]]),
      toString(tmp[k + 1][[1]]),
      toString(tmp[k + 2][[1]]),
      toString(tmp[k + 3][[1]]),
      toString(tmp[k + 4][[1]]),
      tooth_names[k - 1] # prefix of column k (tooth_names skips column 1)
    )
  }
}
# Check the structure of the reshaped data set
str(dta)
# Check whether the first E1 value is stored as character
class(dta$E1[1])
# The reshaping loop stores every value through toString(), so missing values
# arrive here as the literal string "NA". Replace those strings with real NA
# first: otherwise as.factor() would keep "NA" as an ordinary level and
# complete.cases() below would not drop those rows.
dta[] <- lapply(dta, function(col) {
  col[col %in% "NA"] <- NA
  col
})
# E1 and E2 are counts; any remaining non-numeric code is coerced to NA
# (with a warning) by as.numeric().
dta$E1 <- as.numeric(dta$E1)
dta$E2 <- as.numeric(dta$E2)
# The remaining descriptors are categorical.
dta$E3 <- as.factor(dta$E3)
dta$E4 <- as.factor(dta$E4)
dta$E5 <- as.factor(dta$E5)
dta$Geo <- as.factor(dta$Geo)
dta$Ind <- as.factor(dta$Ind)
# Keep only rows without any missing value
dta <- dta[complete.cases(dta), ]
View(dta) # Data is in the format I want/need for my analysis.
# dput() of 5 lines from my data set:
structure(list(Sex = structure(c(2L, 2L, 2L, 1L, NA, 1L), .Label = c("Female",
"Male"), class = "factor"), G1_Major_Human_Subdivisions = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("Sahul_Pacific", "Sino_Americas",
"Sub_Saharan_Africa", "Sunda_Pacific", "West_Eurasia"), class = "factor"),
G2_Continental_Group = structure(c(4L, 4L, 4L, 4L, 4L, 4L
), .Label = c("Central_America", "Europe", "North_Africa",
"North_America", "Oceania", "South_America", "South_Asia",
"South_East_Asia", "Sub_Saharan_Africa"), class = "factor"),
G3_Continental_Region = structure(c(1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("American_Arctic", "Andaman_Archipelago", "Andean",
"Australia", "Carribbean", "Central_Africa", "Central_America",
"Central_Europe", "Eastern_Africa", "Eastern_Europe", "Indian_Sub_Continent",
"Indochinese_Peninsula", "Malay_Archipelago", "Melanesia",
"NA_Northeast_Woodlands", "NA_Northwest_Coast", "NA_Plains",
"NA_South_West", "NA_Subarctic", "NA_Unknown", "North_East_Africa",
"North_Western_Africa", "Northern_Africa", "Northern_Europe",
"Polynesia", "SA_Unknown", "Southern_Africa", "Southern_Europe",
"SSA_Unknown", "Western_Africa", "Western_Europe"), class = "factor"),
G4_Country_State = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Alaska",
"American_Arctic_Unknown", "Andaman_Island", "Angola", "Argentina",
"Australia_Unknown", "Austrian", "Bangladesh", "Barbados",
"Canada", "Canary_Islands", "Central_Australia", "Chile",
"Congo", "Czechoslovakia", "East_India", "Egypt", "England",
"Finland", "France", "Germany", "Ghana", "Greece", "Greenland",
"Guatemala", "Guinea", "Hungary", "India", "India_Unknown",
"Indonesia", "Italy", "Jamaica", "Kenya", "Malta", "Mozambique",
"Myanmar", "NA_Unknown", "Namibia", "New_South_Wales", "New_Zealand",
"Nicobar_Island", "Nigeria", "North_India", "Northern_Territory",
"Pakistan", "Papua_New_Guinea", "Peru", "Philippines", "Queensland",
"Russia", "SA_Unknown", "Solomon_Islands", "Somalia", "South_Africa",
"South_Australia", "South_East_Australia", "South_India",
"Spain", "Sri_Lanka", "SSA_Unknown", "Sudan", "Sweden", "Switzerland",
"Tanzania", "Uganda", "Ukraine", "United_States", "Victoria",
"West_India", "Western_Australia", "Zimbabwe"), class = "factor"),
G5_Locality_Tribe = structure(c(57L, 57L, 57L, 57L, 57L,
57L), .Label = c("Aborigine", "Aboringine", "Ainaho", "Ainaho_Burao",
"Akamba", "Ali_Kush", "Amaponda", "Amaxhosa_Great_Winterberg",
"Apache", "Arawak", "Ashanti", "Badari", "Baffin_Island",
"Baiono", "Ballam_Coffa", "Bambuti_Pygmy", "Bantu_Kaoisoudo",
"Basuto", "Bechuanaland", "Bengal", "Bengal_Bangladesh",
"Berida", "Bihari", "Bingemma", "Brazaville", "Brittany",
"Bukoba", "Cape_Spencer_Aborigine", "Colombo", "Coorg", "Crime_Sebastopol",
"Crocodile _Island_Yan_nhanu", "Croydon_Queensland", "Darood",
"Darood_Hawiya", "Deccan_Berars", "Derby_Coast_Aborigine",
"Didali", "Dravidian", "Eingenadu", "Fanti", "Gannawarri",
"Gondaiaio", "Graubunden_Saint_Moritz", "Guanche", "Hadad",
"Halle", "Hariya", "Haya", "Hexham", "Hindu", "Hindustan",
"Hindustan_Bihar", "Huron", "Inuit", "Inuit_Eleanoran_Bay",
"Ipiutak", "Iroquois", "Java", "Jebel_Moya", "Jilili", "Kaduna",
"Kagoro", "Kalahari", "Kerma", "Ketchipawan", "Khanty_Kondinski",
"Khoikhoi", "Kikuyu", "Knysna_Cave", "Korana", "Kwaiawata _Island_Muyuw",
"Lapland", "Lazio", "Loddon_River_Aborigine", "Luanda", "Mackay_Aborigine",
"Makah", "Makua", "Malaysian", "Manatee_Cradock", "Manitoba",
"Maori", "Mem_Mem", "Minorca", "Mortlake_Aborigine", "Mumbai_Parsi",
"Muri_Province", "Murray_River", "Murray_River_Aborigine",
"Murua_Island_Muyuw", "Muyuw_Kwaiawata_Island", "Nagada",
"Naharhmpikya_Sinhalese", "Native_American", "New_Britain",
"New_Westminster", "Newcastle", "North_Queensland ", "Oriomo_River_Daudai",
"Paestum", "Pagi_Island", "Pakistan", "Paliyan_Tribe", "Paris",
"Pasamayo", "Pathan", "Patna", "Perth", "Plympton_Aborigine",
"Port_Elizabeth", "Punjab", "Rio_Gallegos", "Rome", "Salekhard",
"Sardinia", "Sicily", "Sioux", "South_Wilshire", "St_Bernard",
"Swanport_Aborigine", "Tagalog_Island", "Tal_Horr", "Tegera_Well",
"Teita", "Teso", "Thessaly", "Tigara", "Toszeg", "Upper_Congo_River",
"Valparaiso", "Vancouver_Island", "Veddah", "Vienna", "Walvis_Bay",
"Wollongong", "Wynberg_San", "Yola", "Yoruba_Ilorin", "Zuni"
), class = "factor"), RI1_MAX_E1 = structure(c(1L, 1L, 1L,
1L, 1L, 1L), .Label = "1", class = "factor"), RI1_MAX_E2 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "1", class = "factor"), RI1_MAX_E3 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "A", class = "factor"), RI1_MAX_E4 = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("E", "G", "P", "W"), class = "factor"),
RI1_MAX_E5 = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("O",
"R"), class = "factor"), RI2_MAX_E1 = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = c("1", "CON"), class = "factor"),
RI2_MAX_E2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "CON"), class = "factor"), RI2_MAX_E3 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("A", "B1L1", "CON"), class = "factor"),
RI2_MAX_E4 = structure(c(3L, 2L, 3L, 3L, 2L, 3L), .Label = c("CON",
"E", "G", "P", "W"), class = "factor"), RI2_MAX_E5 = structure(c(3L,
3L, 3L, 3L, 3L, 3L), .Label = c("CON", "O", "R", "R4"), class = "factor"),
RC1_MAX_E1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "1", class = "factor"),
RC1_MAX_E2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2"), class = "factor"), RC1_MAX_E3 = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = c("A", "B1L1"), class = "factor"),
RC1_MAX_E4 = structure(c(4L, 1L, 1L, 5L, 5L, 5L), .Label = c("E",
"EBi", "G", "P", "W"), class = "factor"), RC1_MAX_E5 = structure(c(1L,
1L, 2L, 1L, 1L, 1L), .Label = c("O", "R", "R5"), class = "factor"),
RP3_MAX_E1 = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3"), class = "factor"), RP3_MAX_E2 = structure(c(2L,
2L, 1L, 1L, 2L, 2L), .Label = c("1", "2", "3"), class = "factor"),
RP3_MAX_E3 = structure(c(2L, 2L, 1L, 1L, 2L, 2L), .Label = c("A",
"B1L1", "B1L2", "B2L1", "M1D1", "M1D1L1"), class = "factor"),
RP3_MAX_E4 = structure(c(1L, 17L, 17L, 17L, 17L, 18L), .Label = c("BGLG",
"BGLKBi", "BHLE", "BHLG", "BKLG", "BPLG", "BWLG", "E", "H",
"HBi", "K", "KBi", "MEDGLG", "MGDGLE", "MGDGLG", "MPDPLE",
"P", "PBi", "W"), class = "factor"), RP3_MAX_E5 = structure(c(3L,
11L, 8L, 8L, 3L, 10L), .Label = c("BR2LR", "BR4LR", "BRLR",
"BRLR2", "i2", "i5", "MRDRLR", "O", "R", "R2", "R4", "R5"
), class = "factor"), RP4_MAX_E1 = structure(c(1L, 1L, 1L,
1L, 1L, 1L), .Label = c("1", "2", "3"), class = "factor"),
RP4_MAX_E2 = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3", "4"), class = "factor"), RP4_MAX_E3 = structure(c(2L,
1L, 1L, 1L, 1L, 1L), .Label = c("A", "B1L1", "B2L1", "B2L2",
"M1D1L1"), class = "factor"), RP4_MAX_E4 = structure(c(13L,
13L, 13L, 13L, 13L, 13L), .Label = c("BELG", "BGLG", "BHLG",
"BKLG", "E", "G", "H", "HBi", "K", "KBi", "MGDGLE", "MLFBiDG",
"P", "PBi", "W"), class = "factor"), RP4_MAX_E5 = structure(c(10L,
7L, 7L, 7L, 7L, 7L), .Label = c("BR2L4", "BR2LR", "BRLR",
"i2", "i5", "MRDRLR", "O", "R", "R2", "R4", "R5"), class = "factor"),
RM1_MAX_E1 = structure(c(3L, 3L, 3L, 3L, 3L, 2L), .Label = c("1",
"2", "3", "4"), class = "factor"), RM1_MAX_E2 = structure(c(3L,
2L, 2L, 2L, 2L, 2L), .Label = c("2", "3", "4", "5", "6"), class = "factor"),
RM1_MAX_E3 = structure(c(9L, 3L, 3L, 3L, 3L, 3L), .Label = c("B1L1",
"M1D1", "M1D1L1", "M1D1L2", "M1D2", "M1D2L1", "M1L1", "M2D1",
"M2D1L1", "M2D1L2", "M2D2L1", "M2D2L2", "M3D1L1", "MB1DB1ML1DL1"
), class = "factor"), RM1_MAX_E4 = structure(c(51L, 32L,
32L, 45L, 32L, 38L), .Label = c("BKLG", "BPLG", "MBPDBEMLEDLG",
"MDFLE", "MDFLG", "MDFLK", "MDFLP", "MEDELE", "MEDELP", "MEDGLG",
"MEDKLE", "MEDPLE", "MEDWLK", "MHDELP", "MHDPLE", "MHDPLP",
"MKDELE", "MKDELP", "MKDGLE", "MKDGLG", "MKDGLP", "MKDPLE",
"MKDPLG", "MKDPLP", "MLFBiDG", "MLFBiDP", "MLFDE", "MPBiDPLG",
"MPBiDPLK", "MPBiDPLP", "MPDELE", "MPDELG", "MPDELK", "MPDELP",
"MPDGLE", "MPDGLG", "MPDGLP", "MPDLF", "MPDLFBi", "MPDP",
"MPDPLE", "MPDPLG", "MPDPLK", "MPDPLP", "MPDWLE", "MPDWLG",
"MPDWLP", "MWBiDPLP", "MWBiDWLG", "MWDE", "MWDELE", "MWDELG",
"MWDELK", "MWDELP", "MWDGLE", "MWDGLG", "MWDGLK", "MWDGLP",
"MWDGLR", "MWDGLW", "MWDKLE", "MWDKLG", "MWDKLP", "MWDLF",
"MWDLFBi", "MWDPDE", "MWDPLE", "MWDPLG", "MWDPLK", "MWDPLP",
"MWDPLW", "MWDWLE", "MWDWLG", "MWDWLK", "MWDWLP", "P"), class = "factor"),
RM1_MAX_E5 = structure(c(29L, 42L, 22L, 22L, 20L, 22L), .Label = c("BOLR",
"MBODBOMLRDLR", "Mi2DOLO", "Mi2DOLR", "Mi2DR4LR4", "Mi2DRLO",
"Mi2DRLR", "Mi2DRLR2", "Mi3DOLR", "Mi3DRLO", "Mi3DRLR", "Mi4DRLR",
"Mi5DOLO", "Mi5DOLR", "Mi5DRLO", "Mi5DRLR", "MODi2", "MODLi5",
"MODOLO", "MODOLR", "MODRLO", "MODRLR", "MODRLR2", "MR2DO",
"MR2DOLO", "MR2DOLR", "MR2DR2LO", "MR2DRLO", "MR2DRLR", "MR2DRLR2",
"MR4DOLO", "MR4DOLR", "MR4DRLO", "MR4DRLR", "MR5DOLO", "MR5DRLO",
"MR5DRLR", "MRDOLO", "MRDOLR", "MRDRLi5", "MRDRLO", "MRDRLR",
"R2"), class = "factor"), RM2_MAX_E1 = structure(c(1L, 1L,
1L, 1L, 3L, 3L), .Label = c("1", "2", "3", "4"), class = "factor"),
RM2_MAX_E2 = structure(c(3L, 3L, 3L, 3L, 3L, 3L), .Label = c("1",
"2", "3", "4"), class = "factor"), RM2_MAX_E3 = structure(c(5L,
5L, 5L, 5L, 5L, 5L), .Label = c("A", "B1L1", "M1B1D1L1",
"M1D1", "M1D1L1", "M1D2L1", "M2D1L1", "MB1DB1ML1DL1", "ML3D1"
), class = "factor"), RM2_MAX_E4 = structure(c(45L, 13L,
13L, 51L, 57L, 70L), .Label = c("BGLG", "BHLG", "BKLG", "BKLK",
"BKLP", "BLF", "BPLG", "H", "HBi", "KBi", "MBEDBEMLEDLE",
"MBWDBGMLEDLE", "MDFDLF", "MDFLE", "MDFLG", "MDFMLF", "MDFMLFBi",
"MEDELE", "MEDELG", "MEDGLE", "MEDGLG", "MEDGLK", "MEDKLG",
"MEDWLG", "MGDGLG", "MHBiDELE", "MHDELE", "MHDELG", "MHDELP",
"MHDGLG", "MHDGLP", "MHDWLP", "MKDELE", "MKDELG", "MKDELP",
"MKDGLE", "MKDGLG", "MKDGLP", "MKDLF", "MKDPLE", "MKDPLP",
"MLFBGDLF", "MLFBiDE", "MLFBiDG", "MLFBiDLF", "MLFBiDP",
"MLFBiDW", "MLFBiMDF", "MLFDE", "MLFDG", "MLFDLF", "MLFDLFBi",
"MLFDP", "MLFDW", "MLFMDF", "MPDELE", "MPDELG", "MPDELK",
"MPDELP", "MPDGLE", "MPDGLG", "MPDGLP", "MPDKLG", "MPDLF",
"MPDPLE", "MPDPLG", "MPDPLP", "MPDWLG", "MWDELE", "MWDELG",
"MWDELK", "MWDELP", "MWDGLE", "MWDGLG", "MWDGLK", "MWDGLP",
"MWDKLE", "MWDKLG", "MWDLF", "MWDLFBi", "MWDPLE", "MWDPLG",
"MWDPLK", "MWDPLP", "MWDWLE", "MWDWLG", "MWDWLP", "P", "W"
), class = "factor"), RM2_MAX_E5 = structure(c(28L, 16L,
20L, 16L, 25L, 44L), .Label = c("BOLO", "BOLR", "BRLR", "i2",
"i3", "MBRDBRMLRDLR", "MDi2LO", "MDi2LR", "Mi2DOLR", "Mi2DRLO",
"Mi2DRLR", "Mi3DRLO", "Mi5DOLR", "Mi5DRLR", "MLi2DO", "MLi2DR",
"MLi3DR", "MLi4DO", "MLi5DO", "MLi5DR", "MLODLi2", "MODi5LR",
"MODLi2", "MODOLO", "MODOLR", "MODR4LR", "MODRLO", "MODRLR",
"MR2DOLO", "MR2DOLR", "MR2DRLO", "MR2DRLR", "MR4DOLO", "MR4DOLR",
"MR4DRLO", "MR4DRLR", "MR5DRLO", "MR5DRLR", "MRBRDRLR", "MRDLi2",
"MRDOLO", "MRDOLR", "MRDRLO", "MRDRLR", "O", "R", "R2"), class = "factor"),
RM3_MAX_E1 = structure(c(3L, 1L, 3L, 1L, 2L, 1L), .Label = c("1",
"2", "3", "4", "CON"), class = "factor"), RM3_MAX_E2 = structure(c(3L,
3L, 3L, 3L, 3L, 3L), .Label = c("1", "2", "3", "4", "CON"
), class = "factor"), RM3_MAX_E3 = structure(c(7L, 7L, 7L,
7L, 7L, 7L), .Label = c("A", "B1L1", "B2D1L1", "CON", "M1B1D1L1",
"M1D1", "M1D1L1", "M1D1L2", "M1D2", "M1D2L1", "M2D1", "M2D1L1",
"M2D2", "MB1DB1ML1DL1", "ML3D1"), class = "factor"), RM3_MAX_E4 = structure(c(70L,
49L, 70L, 49L, 61L, 49L), .Label = c("BGLG", "BGLK", "BKDGLG",
"BKLG", "CON", "E", "G", "HBi", "K", "MBEDBGMLEDLG", "MBEDBGMLGDLG",
"MBGDBGMLGDLG", "MDF", "MDFDLF", "MDFLE", "MDFLG", "MEDELE",
"MEDELP", "MEDGLE", "MEDGLG", "MEDLF", "MEDP", "MEDWLE",
"MGDELG", "MGDGLG", "MGDLF", "MHBiDELG", "MHBiDH", "MHBiDHBi",
"MHDGLE", "MHDGLG", "MHDH", "MHDK", "Mi", "MKDELE", "MKDELG",
"MKDGLE", "MKDGLG", "MKDGLP", "MKDKLG", "MKDLF", "MKDP",
"MKDPLG", "MLFBiDE", "MLFBiDG", "MLFBiDLF", "MLFDE", "MLFDG",
"MLFDLF", "MLFDP", "MLFMDF", "MPBiDGLG", "MPBiDW", "MPDELE",
"MPDELG", "MPDGLE", "MPDGLG", "MPDGLP", "MPDGLPBi", "MPDH",
"MPDLF", "MPDP", "MPDPBi", "MPDPLG", "MWBEDELG", "MWDELE",
"MWDELG", "MWDELP", "MWDGLE", "MWDGLG", "MWDGLP", "MWDH",
"MWDKLG", "MWDLF", "MWDPLG", "MWDW", "MWDWLG", "P"), class = "factor"),
RM3_MAX_E5 = structure(c(45L, 45L, 45L, 17L, 29L, 45L), .Label = c("BOLR",
"BR2DRLR", "BRLO", "BRLR", "CON", "i2", "i3", "i5", "MBRBDRMLRDLR",
"MBRDBRMLRDLR", "MDi2LR", "Mi2DOLR", "Mi2DR", "Mi2DRLR",
"Mi3DRLR", "Mi5DRLR", "MLi2DR", "MLi4DR", "MODi5LR", "MODLi2",
"MODLi3", "MODLi5", "MODO", "MODOLO", "MODOLR", "MODR", "MODR4",
"MODRLO", "MODRLR", "MODRLR2", "MR2DOLO", "MR2DR2", "MR2DRLO",
"MR2DRLR", "MR4DR2", "MR4DR4", "MR4DRLO", "MR4DRLR", "MRBRDRLR",
"MRDLi5", "MRDOLR", "MRDR", "MRDR2", "MRDRLO", "MRDRLR",
"O", "R", "R2", "R4"), class = "factor"), ri1_mand_E1 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "1", class = "factor"), ri1_mand_E2 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("1", "2"), class = "factor"),
ri1_mand_E3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("A",
"B1L1"), class = "factor"), ri1_mand_E4 = structure(c(4L,
4L, 4L, 4L, 4L, 4L), .Label = c("E", "G", "K", "P", "W"), class = "factor"),
ri1_mand_E5 = structure(c(3L, 3L, 2L, 2L, 2L, 2L), .Label = c("i2",
"O", "R", "R2", "R4"), class = "factor"), ri2_mand_E1 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("1", "CON"), class = "factor"),
ri2_mand_E2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "CON"), class = "factor"), ri2_mand_E3 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("A", "B1L1", "CON"), class = "factor"),
ri2_mand_E4 = structure(c(5L, 5L, 5L, 5L, 5L, 5L), .Label = c("CON",
"E", "H", "K", "P", "W"), class = "factor"), ri2_mand_E5 = structure(c(5L,
5L, 4L, 4L, 4L, 4L), .Label = c("CON", "i2", "i5", "O", "R",
"R2", "R4"), class = "factor"), rc1_mand_E1 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("1", "2"), class = "factor"),
rc1_mand_E2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2"), class = "factor"), rc1_mand_E3 = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = c("A", "B1L1"), class = "factor"),
rc1_mand_E4 = structure(c(2L, 6L, 2L, 7L, 6L, 7L), .Label = c("BGLG",
"E", "G", "H", "K", "P", "W", "WBi"), class = "factor"),
rc1_mand_E5 = structure(c(3L, 3L, 3L, 3L, 3L, 3L), .Label = c("BRLR",
"i2", "O", "R", "R2", "R4", "R5"), class = "factor"), rp3_mand_E1 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("1", "2"), class = "factor"),
rp3_mand_E2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3"), class = "factor"), rp3_mand_E3 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("A", "B1L1", "M1D1L1"), class = "factor"),
rp3_mand_E4 = structure(c(7L, 7L, 3L, 10L, 7L, 7L), .Label = c("BGLG",
"BWLG", "E", "G", "H", "K", "P", "T", "TBi", "W"), class = "factor"),
rp3_mand_E5 = structure(c(7L, 8L, 7L, 7L, 7L, 7L), .Label = c("BRLR",
"i2", "i3", "i4", "i5", "MRDRLR", "O", "R", "R2", "R4"), class = "factor"),
rp4_mand_E1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "1", class = "factor"),
rp4_mand_E2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2"), class = "factor"), rp4_mand_E3 = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = c("A", "B1L1"), class = "factor"),
rp4_mand_E4 = structure(c(5L, 1L, 2L, 5L, 5L, 5L), .Label = c("E",
"G", "HBi", "K", "P", "T", "TBi", "W"), class = "factor"),
rp4_mand_E5 = structure(c(2L, 3L, 3L, 2L, 2L, 2L), .Label = c("i5",
"O", "R", "R2", "R4"), class = "factor"), rm1_mand_E1 = structure(c(1L,
2L, 2L, 2L, 1L, 1L), .Label = c("2", "R3En"), class = "factor"),
rm1_mand_E2 = structure(c(2L, 2L, 2L, 2L, 3L, 3L), .Label = c("2",
"3", "4", "5", "6"), class = "factor"), rm1_mand_E3 = structure(c(3L,
2L, 2L, 2L, 3L, 5L), .Label = c("M1D1", "M1D1L1", "M2D1",
"M2D1L1", "M2D2", "M2D2L1", "M2D3", "M3D1", "M3D2", "M3D3"
), class = "factor"), rm1_mand_E4 = structure(c(17L, 36L,
36L, 45L, 40L, 40L), .Label = c("MHBiDELG", "MHBiDH", "MHBiDHBi",
"MHBiDK", "MHBiDKBi", "MHBiDP", "MHBiDPBi", "MHBiDPLE", "MHBiDPLG",
"MHBiDWLG", "MHDE", "MHDELG", "MHDG", "MHDGLG", "MHDH", "MHDK",
"MHDP", "MHDPBi", "MHDPLE", "MHDPLG", "MHDWLG", "MKBiDK",
"MKBiDP", "MKDH", "MKDK", "MKDP", "MPBiDELG", "MPBiDK", "MPBiDKLG",
"MPBiDP", "MPBiDPLE", "MPBiDPLG", "MPDE", "MPDELE", "MPDELG",
"MPDGLG", "MPDK", "MPDKBi", "MPDKLG", "MPDP", "MPDPBi", "MPDPLE",
"MPDPLG", "MPDPLP", "MPDPLW", "MPDWLG", "MPDWLW"), class = "factor"),
rm1_mand_E5 = structure(c(34L, 28L, 28L, 28L, 34L, 39L), .Label = c("Mi2Di2",
"Mi2Di3", "Mi2Di5", "Mi2DO", "Mi2DOLR", "Mi2DR", "Mi2DR2",
"Mi2DRLR", "Mi3Di2", "Mi3Di4", "Mi3DO", "Mi3DOLO", "Mi3DOLR",
"Mi3DRLR", "Mi4Di2", "Mi4Di4", "Mi4DO", "Mi4DR4", "Mi5Di2",
"Mi5Di5", "Mi5DO", "Mi5DOLR", "Mi5DR2", "Mi5DR4", "Mi5DRLR",
"MODO", "MODR", "MODRLR", "MR2Di2", "MR2Di2LR", "MR2Di3",
"MR2Di4", "MR2Di5", "MR2DO", "MR2DOLO", "MR2DOLR", "MR2DR",
"MR2DR2", "MR2DR4", "MR2DR5", "MR2DRLR", "MR4Di2", "MR4Di2LR",
"MR4Di3", "MR4Di4", "MR4DO", "MR4DOLR", "MR4DR", "MR4DR4",
"MR4DRLO", "MR4DRLR", "MR5DOLR", "MR5DR4", "MR5DRLO", "MRDO",
"MRDOLR", "MRDR"), class = "factor"), rm2_mand_E1 = structure(c(1L,
2L, 2L, 1L, 2L, 2L), .Label = c("1", "2", "3", "R3En"), class = "factor"),
rm2_mand_E2 = structure(c(2L, 2L, 2L, 2L, 3L, 3L), .Label = c("1",
"2", "3", "4"), class = "factor"), rm2_mand_E3 = structure(c(6L,
6L, 6L, 6L, 9L, 9L), .Label = c("A", "B1D1L1", "B2L1", "B2L2",
"M1B1D1", "M1D1", "M1D1L1", "M1D2", "M2D1", "M2D1L1", "M2D2",
"M3D1"), class = "factor"), rm2_mand_E4 = structure(c(5L,
39L, 30L, 5L, 21L, 39L), .Label = c("BGDPLG", "BHBiLP", "BPLK",
"BPLP", "Cs", "CsBi", "G", "MDF", "MEDE", "MGDPLW", "MHBiDE",
"MHBiDH", "MHBiDK", "MHBiDP", "MHBiDPBi", "MHDE", "MHDELP",
"MHDG", "MHDH", "MHDHBi", "MHDK", "MHDP", "MHDPLG", "MKBiDK",
"MKBiDP", "MKDE", "MKDG", "MKDH", "MKDHBi", "MKDK", "MKDP",
"MKDPBi", "MPBiDGLG", "MPBiDP", "MPDE", "MPDG", "MPDH", "MPDK",
"MPDP", "MPDPLG"), class = "factor"), rm2_mand_E5 = structure(c(5L,
17L, 18L, 7L, 21L, 21L), .Label = c("Bi5Li2", "BR2LO", "BRDOLR",
"i2", "i3", "i4", "i5", "Mi2Di2", "Mi2DO", "Mi2DR", "Mi2DR2",
"Mi3DO", "Mi3DR", "Mi4DO", "Mi5DO", "Mi5DR", "MODO", "MODR",
"MR2Di2", "MR2Di3", "MR2DO", "MR2DOLR", "MR2DR", "MR2DR2",
"MR2DR4", "MR2DRLR", "MR4Di2", "MR4DO", "MR4DOLR", "MR4DR",
"MR4DR4", "MR4DRLR", "MR5DO", "MRDO", "MRDOLR", "MRDR", "R",
"R2"), class = "factor"), rm3_mand_E1 = structure(c(1L, 1L,
1L, 2L, 2L, 2L), .Label = c("1", "2", "CON", "R3En", "R3Pa"
), class = "factor"), rm3_mand_E2 = structure(c(2L, 1L, 1L,
2L, 3L, 3L), .Label = c("1", "2", "3", "4", "CON"), class = "factor"),
rm3_mand_E3 = structure(c(6L, 1L, 1L, 6L, 9L, 9L), .Label = c("A",
"B2L1", "CON", "M1B1D1", "M1B2D1", "M1D1", "M1D1L1", "M2B1D1",
"M2D1", "M2D1L1", "M2D2", "M3D1"), class = "factor"), rm3_mand_E4 = structure(c(3L,
25L, 25L, 44L, 32L, 47L), .Label = c("BHLP", "CON", "Cs",
"E", "G", "K", "MCsLG", "MDF", "MDFLG", "MEDELG", "MEDG",
"MELE", "MGBGDE", "MGBPDK", "MGDK", "MHBiDK", "MHBiDP", "MHDE",
"MHDELG", "MHDG", "MHDGLG", "MHDK", "MHDP", "MHDPLG", "Mi",
"MKBGDG", "MKBiDK", "MKBiDP", "MKDE", "MKDG", "MKDGLG", "MKDK",
"MKDP", "MKDPLG", "MKDW", "MPBGDE", "MPBGDG", "MPBGDP", "MPBiDE",
"MPBiDK", "MPBPDE", "MPDE", "MPDELG", "MPDG", "MPDGLG", "MPDK",
"MPDP", "MPDPLE", "MPDPLG", "MPDWLG", "MWDELG", "MWDP", "P"
), class = "factor"), rm3_mand_E5 = structure(c(3L, 42L,
42L, 23L, 33L, 26L), .Label = c("BR2LO", "CON", "i2", "i3",
"i5", "Mi2BRDR", "Mi2Di2", "Mi2DO", "Mi2DOLR", "Mi2DR", "Mi2DRLR",
"Mi3DO", "Mi3DOLR", "Mi3LR", "Mi4DR", "Mi5DO", "Mi5DR", "Mi5DRLR",
"MOBRDO", "MOBRDR", "MODO", "MODOLR", "MODR", "MODRLR", "MR2BRDR",
"MR2DO", "MR2DOLR", "MR2DR", "MR2DR2", "MR2DR4", "MR2DRLR",
"MR4Di5", "MR4DO", "MR4DR", "MR4DR4", "MR4DRLR", "MRBR2DO",
"MRBRDR", "MRDR", "MRDRLR", "O", "R", "R2"), class = "factor")), row.names = c("99_1_192",
"99_1_194", "99_1_196", "99_1_197", "99_1_198", "99_1_201"), class = "data.frame")

Related

cut.default error in heatmap generation R

I want to generate a heatmap from a data frame with 6 rows and 8 columns. The last row of the data frame holds the information used to annotate the columns. The structure of the data frame is as follows:
heatmap_try <-structure(list(BGC0000041 = structure(c(1L, 2L, 1L, 1L, 1L, 3L
), .Label = c("0", "0.447458977", "a"), class = "factor"), BGC0000128 = structure(c(1L,
1L, 1L, 3L, 2L, 4L), .Label = c("0", "1.785875195", "4.093659107",
"a"), class = "factor"), BGC0000287 = structure(c(1L, 1L, 1L,
3L, 2L, 4L), .Label = c("0", "1.785875195", "4.456229186", "b"
), class = "factor"), BGC0000294 = structure(c(3L, 1L, 2L, 4L,
1L, 5L), .Label = c("0", "2.035046947", "3.230553742", "3.286304185",
"b"), class = "factor"), BGC0000295 = structure(c(1L, 1L, 1L,
2L, 1L, 3L), .Label = c("0", "2.286304185", "c"), class = "factor"),
BGC0000308 = structure(c(4L, 2L, 3L, 5L, 1L, 6L), .Label = c("6.277728291",
"6.313707588", "6.607936616", "6.622871165", "6.64385619",
"c"), class = "factor"), BGC0000323 = structure(c(1L, 2L,
1L, 1L, 1L, 3L), .Label = c("0", "0.447458977", "c"), class = "factor"),
BGC0000328 = structure(c(1L, 2L, 1L, 1L, 1L, 3L), .Label = c("0",
"0.447458977", "c"), class = "factor")), class = "data.frame", row.names = c("Gut",
"Oral", "Anterior_nares", "Retroauricular_crease", "Vagina",
"AL"))
My code for heatmap generation is as follows (I am using pheatmap library):
library(pheatmap)
# Rows 1-5 are the values to plot; row 6 carries the column annotation.
heatmap_data1 <- heatmap_try[1:5, 1:8]
anotation_data <- as.data.frame(t(heatmap_try[6, ]))
row.names(anotation_data) <- colnames(heatmap_data1)
# The columns of heatmap_data1 are passed to pheatmap() as they are stored in
# heatmap_try (factors), without any conversion.
pheatmap(
  heatmap_data1,
  annotation_col = anotation_data,
  color = colorRampPalette(c("white", "blue"))(n = 100),
  cellwidth = 40,
  cellheight = 6,
  fontsize_row = 5,
  cluster_rows = FALSE,
  cluster_cols = FALSE
)
However, I am getting the following error:
Error in cut.default(x, breaks = breaks, include.lowest = T) :
'x' must be numeric
What am I doing wrong?
Thanks!
This is because the columns of heatmap_data1 are factors; pheatmap needs them to be numeric. One way to convert them is:
# Convert every factor column to the numbers its labels spell out. Going
# through as.character() matters: as.numeric() on a factor returns the
# internal level codes. Assigning into `[]` keeps the data-frame shell, so the
# row names (the heatmap's row labels) survive; as.data.frame(lapply(...))
# would replace them with 1, 2, 3, ...
heatmap_data1_num <- heatmap_data1
heatmap_data1_num[] <- lapply(
  heatmap_data1,
  function(x) as.numeric(as.character(x))
)
# then as before
pheatmap(
  heatmap_data1_num,
  annotation_col = anotation_data,
  color = colorRampPalette(c("white", "blue"))(n = 100),
  cellwidth = 40,
  cellheight = 6,
  fontsize_row = 5,
  cluster_rows = FALSE,
  cluster_cols = FALSE
)

How to change from factored data to numeric?

I have this dataframe in factored form:
Data <- structure(list(ID = c("1", "2", "3", "4", "5",
"6"), V1 = structure(c(1L, 1L, 4L, 4L, 4L, 1L), .Label = c("1",
"129", "2", "3", "76"), class = "factor"), V2 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("1", "3"), class = "factor"),
V3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "1", class = "factor"),
V4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"3"), class = "factor"), V5 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "1", class = "factor"), V6 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "1", class = "factor"), V7 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "1", class = "factor"), V8 = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("0", "1", "3"), class = "factor"),
V9 = structure(c(2L, 2L, 3L, 2L, 2L, 2L
), .Label = c("0", "1", "3"), class = "factor"), V10 = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("0", "1", "2", "3"), class = "factor"),
V11 = structure(c(2L, 2L, 2L, 2L,
2L, 2L), .Label = c("0", "1"), class = "factor"), V12 = structure(c(1L,
1L, 1L, 1L, 1L, 3L), .Label = c("1", "2", "3"), class = "factor"),
V13 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3"), class = "factor"), V14 = structure(c(1L,
1L, 2L, 1L, 1L, 1L), .Label = c("1", "3"), class = "factor"),
V15 = structure(c(2L, 2L, 2L, 2L, 2L,
2L), .Label = c("0", "1", "3"), class = "factor"), V17 = structure(c(3L,
1L, 3L, 1L, 1L, 3L), .Label = c("1", "2", "3"), class = "factor"),
V18 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3"), class = "factor"), V19 = structure(c(1L,
1L, 2L, 1L, 1L, 1L), .Label = c("1", "3"), class = "factor"),
V20 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"3"), class = "factor"), V21 = structure(c(1L, 3L,
1L, 1L, 3L, 1L), .Label = c("1", "2", "3"), class = "factor"),
V22 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3"), class = "factor"), V23 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("1", "3"), class = "factor"),
V24 = structure(c(1L, 1L, 1L, 1L, 1L, 1L
), .Label = "1", class = "factor"), V25 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("1", "2", "3"), class = "factor"),
V26 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2"), class = "factor"), V27 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "1", class = "factor"), V28 = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("0", "1"), class = "factor"),
V29 = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), V30 = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("0", "1"), class = "factor"),
V31 = structure(c(2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("0", "1"), class = "factor"), V32 = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("0", "1"), class = "factor"),
Totals = structure(c(1L, 1L, 4L, 1L, 2L, 2L), .Label = c("1",
"2", "3", "5"), class = "factor")), row.names = c(NA, 6L), class = "data.frame")
It is in factored form but I need to change it to numeric form without changing the original dataframe called Data. So, I tried this method:
# numeric() builds a zero-filled vector of a requested length; it does not
# convert values, so handing it a whole column fails with
# "invalid 'length' argument".
Data2 <- lapply(Data[c(1:33)], numeric)
This gave me the "invalid length argument" error. So I tried this method after looking up the issue:
# as.numeric() applied directly to a factor returns the internal integer level
# codes, not the numbers the labels spell out, which is why values shift.
Data2 <- lapply(Data[c(1:33)], as.numeric)
# Re-assemble the converted columns into a data frame
Data2 <- as.data.frame(Data2)
I do indeed get a new dataframe, but the data doesn't match what I have in my script. Some numbers change by 1 value, for example. (Where there is a 3, it is a 4. Where there is a 4, there is a 5).
Any other methods to this issue?
EDIT: earlier in my script I convert from character to factor using this method:
# Converts columns 2-33 to factors. NOTE(review): lapply() returns a plain
# list, so after this line Data is no longer a data frame and column 1 (ID) is
# not part of it -- confirm this is intended.
Data <- lapply(Data[c(2:33)], factor)
Would it be easier to instead convert to numeric and wait until I am done with all of my analyses to convert to factor?
You need to convert to character first:
# Convert via as.character() so the labels, not the level codes, become
# numbers. Writing into a copy through `[]` leaves the original Data untouched
# and keeps the result a data frame instead of a plain list.
Data2 <- Data
Data2[] <- lapply(Data, function(x) as.numeric(as.character(x)))
Try this dplyr approach:
library(dplyr)
# Convert every factor column to numeric via its labels. Selecting with
# where(is.factor) instead of fixed positions keeps this correct if columns
# are added, removed or reordered; non-factor columns (such as ID) are left
# as they are.
Data2 <- Data %>%
  mutate(
    across(
      where(is.factor),
      function(x) as.numeric(as.character(x))
    )
  )
We can use type.convert from base R
# Let type.convert() pick the natural type of each column. The result is
# stored under a new name so the original Data stays unchanged.
Data2 <- type.convert(Data, as.is = TRUE)

Warning message In `[<-.factor`(`*tmp*`, iseq, value = foo) : invalid factor level, NA generated when trying to add vector to row subset

I'm writing a function that attempts to add values in a single row of a data.frame in several columns at once:
require(stringr)
# Write a vector of point values into the matching columns of a single row.
#
# Arguments:
#   df                           data frame to copy and update
#   keyRowNum                    index of the row that receives the values
#   searchStringForPointColNames pattern passed to str_match() to pick the
#                                target columns by name
#   pointsVector                 values to insert; NA entries are removed first
#
# The cleaned values are written (as character) only when their count equals
# the number of matching columns. The updated row is printed and returned
# invisibly; otherwise NULL is returned invisibly. The updated copy of df
# itself is not returned.
#
# NOTE(review): data.frame(df, stringsAsFactors = FALSE) does not appear to
# convert columns of df that are already factors, so the assignment below can
# still hit `[<-.factor` -- confirm against the caller's data.
addPointsToKeyRow <- function(df, keyRowNum, searchStringForPointColNames,
                              pointsVector) {
  name_hits <- str_match(colnames(df), searchStringForPointColNames)
  target_cols <- which(!is.na(name_hits))
  points <- pointsVector[!is.na(pointsVector)]

  # Diagnostic output about the cleaned vector
  print(is.vector(points))
  print(is.data.frame(points))
  print(points)

  # Nothing is written unless there is exactly one value per target column
  if (length(points) != length(target_cols)) {
    return(invisible(NULL))
  }

  updated <- data.frame(df, stringsAsFactors = FALSE)
  updated[keyRowNum, target_cols] <- as.character(points)
  print(updated[keyRowNum, ])
}
When I apply the function to my data (addPointsToKeyRow(finalDf, which(finalDf[,1]=="key"), "points_q", pointVals)), I get the following warnings:
In [<-.factor(*tmp*, iseq, value = "2") :
invalid factor level, NA generated
I've looked for the error on SO and other sites, and the recommendation always seems to be to make sure your data.frame has stringsAsFactors = FALSE.
I think my issue might be that when I subset the data.frame (newDf[keyRowNum, pointColNums]), it no longer keeps stringsAsFactors = FALSE.
Regardless of whether that's the issue or not, I'd very much welcome some help solving this weird issue. Many thanks in advance!
For the sake of an example, let's say df is:
df = structure(list(first = structure(c(7L, 9L, 5L, 4L, 10L, 2L, 3L,
6L, 1L, 8L), .Label = c("autumn", "spring", "summer", "winter",
"july", "betty", "november", "echo", "victor", "tango"), class = "factor"),
last = structure(c(6L, 2L, 4L, 5L, 1L, 8L, 3L, 9L, 10L, 7L
), .Label = c("brummett1", "do", "drorbaugh", "galeno", "gerber",
"key", "lyons", "pecsok", "perezfranco", "swatt"), class = "factor"),
question1 = structure(c(1L, 1L, 1L, 4L, 6L, 2L, 5L, 3L, 5L,
5L), .Label = c("0", "0.25", "1:02:01", "1:2 50%", "2-Jan",
"50%"), class = "factor"), points_q1 = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question2 = structure(c(8L, 10L, 6L, 5L, 2L, 3L, 7L, 1L,
4L, 9L), .Label = c(" a | b; A| Aa | Ab; b| ab | bb; the possibility that the offspring will be heterozygous is about 25%. The same goes for the homozygous recessive it is a 1:1:1:1",
"1/4 heterozygous for \xf1a\xee and 0 recessive for \xf1b\xee",
"16-Mar", "2-Jan", "3:1 25%", "4-Jan", "Male=aabb Female=AAbb Heterozygous is going to be 1/2. Homozygous is going to be 1/4.",
"possible offspring genotypes (each with probability of 0.25): AABb AaBb AAbb Aabb. Question is asking about probability of Aabb_ which is 0.25.",
"The square shows Ab Ab_ Bb Bb so 50% or 1/2. ", "Xa Yb (father) crossed with XA Xb (mother) = 1/2 "
), class = "factor"), points_q2 = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question3 = structure(c(4L, 5L, 3L, 5L, 5L, 5L, 7L, 2L, 6L,
1L), .Label = c("Codominance", "coheritance", "incomplete dominance",
"Incomplete dominance", "Incomplete dominance ", "Incomplete dominance. ",
"Independent Assortment"), class = "factor"), points_q3 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question4 = structure(c(3L, 4L, 2L, 3L, 6L, 3L, 7L, 1L, 5L,
4L), .Label = c("", "co-dominance", "Codominance", "Codominance ",
"Codominance. ", "Codominant ", "Independent Assortment? (Wrong)"
), class = "factor"), points_q4 = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question5 = structure(c(2L, 10L, 6L, 4L, 5L, 3L, 8L, 1L,
7L, 9L), .Label = c(" X | Y; X| XX | XY; x| Xx | xY; the percentage will be 25 % or 1/4 the same applies to the son ",
"0 for daughter_ because male can only give non-colorblind X chromosome (because he's not colorblind an only has one X chromosome). 0.25 for both son and colorblind.",
"0.25", "25% for son and 25% for daughter", "25% for the son and 25% for the daughter ",
"4-Jan", "50%", "Father=XY Mother=X2Y Therefore_ by using the punnet square_ I was able to show/understand that the probability of them having a son AND him being colorblind is 1/4.",
"To have a son or daughter is 50/50. To have a colorblind daughter is .25 whereas to have a colorblind son is .75 because it is carried on the X chromosome and the son is much more likely to inherit this because he has less x to work with",
"XcY (father) XC Xc (mother) Daughter is 1/4 son 1/4"), class = "factor"),
points_q5 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = "", class = "factor"), question6 = structure(c(3L,
6L, 7L, 8L, 5L, 2L, 10L, 9L, 4L, 1L), .Label = c("Chromatids ",
"Chromosomes (diploids)", "homologous chromosome pairs",
"Homologous chromosome pairs are being separated. ", "Homologous chromosomes ",
"Homologous pairs ", "homologous pairs of chromosomes", "Homologus Chromosomes ",
"sister chromatids ", "Sister Chromatids?"), class = "factor"),
points_q6 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = "", class = "factor"), question7 = structure(c(6L,
8L, 5L, 7L, 8L, 2L, 3L, 1L, 9L, 4L), .Label = c("", "Chromatids (haploids)",
"Daughter Chromosomes?", "One cell to 2", "sister chromatids",
"Sister chromatids", "Sister Chromatids", "Sister chromatids ",
"Sister chromatids within daughter cells are separating. "
), class = "factor"), points_q7 = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question8 = structure(c(1L, 4L, 1L, 2L, 4L, 2L, 3L, 6L, 5L,
3L), .Label = c("sister chromatids", "Sister chromatids",
"Sister Chromatids", "Sister chromatids ", "Sister chromatids are held together by the centromeres. In prophase chromosomes become visible. During metaphase chromosomes attach to spindles. During Anaphase the chromosomes are split apart and in telophase the cells start to create cleavage. ",
"sisters chromatides"), class = "factor"), points_q8 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question9 = structure(c(2L, 4L, 1L, 3L, 4L, 3L, 3L, 2L, 5L,
3L), .Label = c("prohase ", "prophase", "Prophase", "Prophase ",
"They condense during prophase before the rest of the phases. "
), class = "factor"), points_q9 = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question10 = structure(c(1L, 3L, 1L, 2L, 3L, 2L, 2L, 1L,
4L, 2L), .Label = c("anaphase", "Anaphase", "Anaphase ",
"During anaphase. "), class = "factor"), points_q10 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question11 = structure(c(3L, 4L, 3L, 4L, 4L, 4L, 4L, 3L,
1L, 2L), .Label = c("During prophase. ", "Telephase ", "telophase",
"Telophase"), class = "factor"), points_q11 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question12 = structure(c(1L, 3L, 1L, 2L, 3L, 2L, 3L, 1L,
4L, 2L), .Label = c("metaphase", "Metaphase", "Metaphase ",
"Metaphase. "), class = "factor"), points_q12 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
question13 = structure(c(1L, 4L, 1L, 4L, 2L, 4L, 2L, 5L,
3L, 6L), .Label = c("centromere", "Centromere", "Centromere. ",
"Centromeres", "centromeres ", "Cleavage"), class = "factor"),
points_q13 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "", class = "factor")), .Names = c("first",
"last", "question1", "points_q1", "question2", "points_q2", "question3",
"points_q3", "question4", "points_q4", "question5", "points_q5",
"question6", "points_q6", "question7", "points_q7", "question8",
"points_q8", "question9", "points_q9", "question10", "points_q10",
"question11", "points_q11", "question12", "points_q12", "question13",
"points_q13"), row.names = c(NA, -10L), class = "data.frame")
which(finalDf[,1]=="key") is 1.
pointVals is c(NA, "2", "2", "2", "2", "2", "2", "2", "1", "1", "1", "1",
"1", "1")
For clarification, I'd want the final table to look something like:
First Last question1 points_q1 question2 points_q2 etc.
key key 0 2 "possible_offspring_genotypes..." 1 etc.
I have reduced your function based on my understanding. Let me know if it gives you what you want, or if I have misunderstood something.
# Fill one row of `df` with `pointsVector`, using only the columns whose name
# matches `searchString`. Returns the (possibly updated) data frame.
addPointsToKeyRow <- function(df, keyRowNum, searchString, pointsVector) {
  # Logical mask over the columns: TRUE where the name matches searchString
  isPointCol <- grepl(searchString, colnames(df))

  # Without exactly one value per matching column there is nothing sensible
  # to assign, so hand the data frame back untouched
  if (length(pointsVector) != sum(isPointCol)) {
    return(df)
  }

  # Write the values into the requested row of the matching columns
  df[keyRowNum, isPointCol] <- pointsVector
  df
}
# Convert every column from factor to character so that new values (e.g. "2")
# can be written without "invalid factor level" warnings. Assigning into
# `df[]` replaces each column while keeping the data.frame structure.
df[] <- lapply(df, as.character)
# Values to write into the points columns, one per matching column
pointVals <- c("2", "2", "2", "2", "2", "2", "2", "1", "1", "1", "1", "1", "1")
# Update row 1; R is case-sensitive, so the vector must be referenced as
# `pointVals`, exactly as it is defined above
df <- addPointsToKeyRow(df, 1, "points_q", pointVals)
# Inspect the updated data frame
df

Select observations in R based on maximum number listed in a column

I hope I've done this correctly! I have two data frames:
teachers = structure(list(Teacher = c(123L, 123L, 123L, 123L, 124L),
tStudents = c(3L, 3L, 4L, 3L, 4L), Term = c(1801L, 1802L, 1801L, 1803L, 1802L),
Course = structure(c(5L, 6L, 7L, 6L, 8L), .Label = c("ENGG",
"ENGG2", "LITT", "LITT2", "MATH", "MATH2", "PHYS", "SCIE"
), class = "factor")), .Names = c("Teacher", "tStudents", "Term", "Course"), row.names = c(NA, 5L), class = "data.frame")
enrols = structure(list(UniqueStudent = structure(c(3L, 2L, 1L, 5L, 4L),
.Label = c("1801-ENGG-N1-abcd1#abc.edu.au", "1801-MATH-C1-abcd1#abc.edu.au","1801-PHYS-L1-abcd1#abc.edu.au", "1802-MATH2-G1-abcd1#abc.edu.au", "1802-SCIE-K2-abcd1#abc.edu.au"), class = "factor"), Term = c(1801L,1801L, 1801L, 1802L, 1802L), Student.Email.Addresses = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "abcd1#abc.edu.au", class = "factor"), ID = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "s12344", class = "factor"),
Gender.Description = structure(c(1L, 1L, 1L, 1L, 1L), .Label = "M", class = "factor"),
Age = c(12L, 12L, 12L, 12L, 12L), Program.Short.Description = structure(c(1L,
1L, 1L, 1L, 1L), .Label = "LSC1", class = "factor"), Term.CC.CN = structure(c(3L,
2L, 1L, 5L, 4L), .Label = c("1801-ENGG-N1", "1801-MATH-C1",
"1801-PHYS-L1", "1802-MATH2-G1", "1802-SCIE-K2"), class = "factor"),
Course.Code = structure(c(4L, 2L, 1L, 5L, 3L), .Label = c("ENGG",
"MATH", "MATH2", "PHYS", "SCIE"), class = "factor"), Class.Number = structure(c(4L,
1L, 5L, 3L, 2L), .Label = c("C1", "G1", "K2", "L1", "N1"), class = "factor"),
Teacher = c(123L, 123L, 125L, 124L, 123L)), .Names = c("UniqueStudent", "Term", "Student.Email.Addresses", "ID", "Gender.Description", "Age", "Program.Short.Description", "Term.CC.CN", "Course.Code", "Class.Number", "Teacher"), row.names = c(NA, 5L), class = "data.frame")
teachers$tStudents lists the maximum number of students allowed to be allocated to a teacher per Term and Course. I've also pre-merged the Course enrolments in the "enrols" data to list the Teachers for each course.
So, what I need to do is create class lists from the enrols data using the teachers data by c("teacher", "Term", "Course") but my class lists can only select a maximum value of students based on the number listed in teachers$tStudents. Ideally, I'd also like to select a representative distribution of students so that the new class lists have both genders, different ages and are from different Program.Short.Description.
I've tried merging in different ways in dplyr and can create full lists with all students but haven't been able to use the teachers$tStudents column to limit the number of observations to select. Is this possible?

What reshaping problems can melt/cast not solve in a single step?

reshape2 is a package which allows a powerful array of data transformations, through its two-part melt/cast approach. However, like all tools, it embeds assumptions which limit the cases it can handle.
What data reshaping problem can reshape2 not handle in its current form?
The ideal answer will include:
A description of the type of use cases where this data shape is typically found
Sample data
Code to accomplish the transformation (ideally using as much of the transformation with reshape2 as possible)
Example
"Wide" data is common in panel applications.
# Melt a data frame whose "wide" column names pack several variables together,
# joined by `sep` (e.g. "pio_1_2"), and split the melted variable name into
# one column per component.
#
# Arguments:
#   data          data frame to reshape
#   id.vars       identifier columns, passed through to melt()
#   new.names     names for the components that follow the first piece of each
#                 wide column name; must have one entry per separator
#   sep           separator between the components of a wide column name
#   variable.name name given to the first component's column
#   ...           further arguments forwarded to melt()
#
# Returns the melted data with its `variable` column replaced by the split
# component columns.
#
# NOTE(review): `sep` is handed to sub(), grep(), str_count() and str_split()
# as a pattern, so it is presumably interpreted as a regular expression; the
# default "." would then match any character -- confirm, or pass an escaped
# separator.
melt.wide <- function(data, id.vars, new.names, sep=".", variable.name="variable", ... ) {
# Guess number of variables currently wide
# Strip a trailing separator from the column names first
colnames(data) <- sub( paste0(sep,"$"), "", colnames(data) )
# Columns whose name contains the separator are treated as the wide ones
wide.vars <- colnames(data)[grep( sep, colnames(data) )]
# Number of separators found in each wide column name
n.wide <- str_count( wide.vars, sep )
# Require one new name per separator; this only behaves as a single check
# when every wide column has the same number of separators
stopifnot(length(new.names)==unique(n.wide))
# Melt
data.melt <- melt(data,id.vars=id.vars,measure.vars=wide.vars,...)
# Split each melted variable name into its components and combine them.
# NOTE(review): stack.list() is not defined in the visible code -- presumably
# it comes from a helper package that must be loaded; TODO confirm.
new <- stack.list(str_split(data.melt$variable,sep))
colnames(new) <- c(variable.name,new.names)
# Drop the original combined `variable` column before attaching the pieces
data.melt <- subset(data.melt,select=c(-variable))
cbind(data.melt,new)
}
choice.vars <- colnames(res)[grep("_",colnames(res))]
melt.wide( subset(res,select=c("WorkerId",choice.vars)), id.vars="WorkerId", new.names=c("set","option"), sep="_")
The new function returns a melted object that can then be *cast.
Where the data is:
so <- structure(list(WorkerId = c(12L, 13L, 27L, 25L, 30L, 8L), pio_1_1 = structure(c(2L,
1L, 2L, 1L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
pio_1_2 = structure(c(1L, 2L, 2L, 2L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), pio_1_3 = structure(c(1L, 1L,
1L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
pio_1_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"),
pio_2_1 = structure(c(1L, 2L, 2L, 1L, 1L, 2L), .Label = c("No",
"Yes"), class = "factor"), pio_2_2 = structure(c(1L, 1L,
1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
pio_2_3 = structure(c(2L, 2L, 2L, 2L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), pio_2_4 = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = "No", class = "factor"), pio_3_1 = structure(c(2L,
2L, 2L, 2L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
pio_3_2 = structure(c(2L, 1L, 1L, 1L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), pio_3_3 = structure(c(2L, 1L,
2L, 1L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
pio_3_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"),
pio_4_1 = structure(c(2L, 1L, 2L, 2L, 1L, 2L), .Label = c("No",
"Yes"), class = "factor"), pio_4_2 = structure(c(2L, 2L,
2L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
pio_4_3 = structure(c(1L, 2L, 1L, 1L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), pio_4_4 = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = "No", class = "factor"), caremgmt_1_1 = structure(c(2L,
2L, 1L, 2L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
caremgmt_1_2 = structure(c(1L, 2L, 2L, 2L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), caremgmt_1_3 = structure(c(1L,
1L, 1L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
caremgmt_1_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"),
caremgmt_2_1 = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), caremgmt_2_2 = structure(c(1L,
2L, 1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
caremgmt_2_3 = structure(c(2L, 1L, 2L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), caremgmt_2_4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"), caremgmt_3_1 = structure(c(2L,
1L, 2L, 1L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
caremgmt_3_2 = structure(c(2L, 1L, 2L, 2L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), caremgmt_3_3 = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
caremgmt_3_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"),
caremgmt_4_1 = structure(c(1L, 1L, 2L, 1L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), caremgmt_4_2 = structure(c(2L,
2L, 2L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
caremgmt_4_3 = structure(c(1L, 1L, 1L, 1L, 1L, 2L), .Label = c("No",
"Yes"), class = "factor"), caremgmt_4_4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"), prev_1_1 = structure(c(1L,
1L, 2L, 1L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
prev_1_2 = structure(c(1L, 2L, 1L, 2L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), prev_1_3 = structure(c(2L, 1L,
1L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
prev_1_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"),
prev_2_1 = structure(c(1L, 1L, 2L, 1L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), prev_2_2 = structure(c(2L, 2L,
1L, 2L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
prev_2_3 = structure(c(1L, 2L, 1L, 1L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), prev_2_4 = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = "No", class = "factor"), prev_3_1 = structure(c(1L,
2L, 1L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
prev_3_2 = structure(c(1L, 1L, 2L, 1L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), prev_3_3 = structure(c(2L, 2L,
1L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
prev_3_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "No", class = "factor"),
prev_4_1 = structure(c(1L, 2L, 2L, 1L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), prev_4_2 = structure(c(1L, 1L,
2L, 1L, 2L, 2L), .Label = c("No", "Yes"), class = "factor"),
prev_4_3 = structure(c(1L, 1L, 1L, 2L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), prev_4_4 = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = "No", class = "factor"), price_1_1 = structure(c(30L,
12L, 1L, 16L, 28L, 17L), .Label = c("$2,500", "$2,504", "$2,507",
"$2,509", "$2,512", "$2,513", "$2,515", "$2,526", "$2,547",
"$2,548", "$2,578", "$2,588", "$2,594", "$2,605", "$2,607",
"$2,617", "$2,618", "$2,622", "$2,635", "$2,649", "$2,670",
"$2,672", "$2,679", "$2,681", "$2,698", "$2,704", "$2,721",
"$2,782", "$2,851", "$2,884", "$2,919", "$2,925", "$2,935",
"$3,022"), class = "factor"), price_1_2 = structure(c(1L,
19L, 5L, 17L, 7L, 1L), .Label = c("$2,500", "$2,501", "$2,502",
"$2,504", "$2,513", "$2,515", "$2,517", "$2,532", "$2,535",
"$2,558", "$2,564", "$2,571", "$2,575", "$2,578", "$2,608",
"$2,633", "$2,634", "$2,675", "$2,678", "$2,687", "$2,730",
"$2,806", "$2,827", "$2,848", "$2,891", "$2,901", "$2,923",
"$2,933", "$2,937", "$2,958", "$2,987"), class = "factor"),
price_1_3 = structure(c(11L, 1L, 1L, 8L, 19L, 14L), .Label = c("$2,500",
"$2,504", "$2,507", "$2,513", "$2,516", "$2,518", "$2,564",
"$2,579", "$2,580", "$2,583", "$2,584", "$2,592", "$2,604",
"$2,608", "$2,639", "$2,643", "$2,646", "$2,665", "$2,667",
"$2,695", "$2,698", "$2,709", "$2,710", "$2,713", "$2,714",
"$2,750", "$2,757", "$2,876", "$2,978", "$2,984", "$3,024",
"$3,059"), class = "factor"), price_1_4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "$2,500", class = "factor"),
price_2_1 = structure(c(27L, 32L, 19L, 22L, 4L, 26L), .Label = c("$2,500",
"$2,504", "$2,505", "$2,510", "$2,511", "$2,512", "$2,515",
"$2,517", "$2,518", "$2,529", "$2,533", "$2,537", "$2,551",
"$2,553", "$2,574", "$2,593", "$2,600", "$2,605", "$2,608",
"$2,612", "$2,613", "$2,618", "$2,639", "$2,657", "$2,714",
"$2,730", "$2,747", "$2,764", "$2,771", "$2,773", "$2,813",
"$2,859", "$2,901", "$3,019", "$3,037"), class = "factor"),
price_2_2 = structure(c(12L, 2L, 1L, 27L, 1L, 7L), .Label = c("$2,500",
"$2,502", "$2,510", "$2,514", "$2,515", "$2,516", "$2,517",
"$2,518", "$2,520", "$2,521", "$2,523", "$2,536", "$2,544",
"$2,575", "$2,583", "$2,592", "$2,602", "$2,624", "$2,644",
"$2,652", "$2,662", "$2,677", "$2,720", "$2,761", "$2,765",
"$2,770", "$2,772", "$2,835", "$2,873", "$2,911", "$2,950",
"$2,962"), class = "factor"), price_2_3 = structure(c(32L,
1L, 8L, 33L, 29L, 11L), .Label = c("$2,500", "$2,506", "$2,507",
"$2,510", "$2,511", "$2,512", "$2,515", "$2,517", "$2,527",
"$2,528", "$2,540", "$2,554", "$2,562", "$2,565", "$2,568",
"$2,581", "$2,597", "$2,611", "$2,616", "$2,631", "$2,652",
"$2,663", "$2,671", "$2,672", "$2,685", "$2,727", "$2,731",
"$2,742", "$2,771", "$2,778", "$2,781", "$2,970", "$2,984",
"$2,986", "$3,030"), class = "factor"), price_2_4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "$2,500", class = "factor"),
price_3_1 = structure(c(24L, 1L, 28L, 7L, 18L, 21L), .Label = c("$2,500",
"$2,501", "$2,503", "$2,505", "$2,509", "$2,512", "$2,535",
"$2,537", "$2,542", "$2,553", "$2,556", "$2,560", "$2,561",
"$2,574", "$2,584", "$2,618", "$2,624", "$2,629", "$2,637",
"$2,664", "$2,761", "$2,840", "$2,875", "$2,883", "$2,891",
"$2,933", "$2,953", "$2,978", "$3,039", "$3,043", "$3,067"
), class = "factor"), price_3_2 = structure(c(3L, 1L, 5L,
19L, 25L, 9L), .Label = c("$2,500", "$2,501", "$2,503", "$2,504",
"$2,512", "$2,517", "$2,540", "$2,543", "$2,546", "$2,560",
"$2,567", "$2,573", "$2,586", "$2,592", "$2,594", "$2,603",
"$2,604", "$2,606", "$2,628", "$2,633", "$2,635", "$2,693",
"$2,696", "$2,714", "$2,734", "$2,739", "$2,770", "$2,791",
"$2,797", "$2,936", "$2,967", "$3,021", "$3,024"), class = "factor"),
price_3_3 = structure(c(26L, 7L, 5L, 32L, 10L, 24L), .Label = c("$2,500",
"$2,501", "$2,502", "$2,505", "$2,506", "$2,507", "$2,508",
"$2,509", "$2,512", "$2,515", "$2,519", "$2,547", "$2,556",
"$2,574", "$2,587", "$2,592", "$2,608", "$2,616", "$2,621",
"$2,635", "$2,638", "$2,667", "$2,671", "$2,688", "$2,694",
"$2,700", "$2,717", "$2,759", "$2,809", "$2,864", "$2,891",
"$2,912", "$3,011", "$3,012"), class = "factor"), price_3_4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "$2,500", class = "factor"),
price_4_1 = structure(c(29L, 13L, 16L, 24L, 33L, 19L), .Label = c("$2,500",
"$2,505", "$2,506", "$2,508", "$2,511", "$2,525", "$2,549",
"$2,562", "$2,577", "$2,582", "$2,586", "$2,591", "$2,621",
"$2,636", "$2,654", "$2,670", "$2,722", "$2,726", "$2,733",
"$2,744", "$2,745", "$2,755", "$2,768", "$2,805", "$2,817",
"$2,827", "$2,835", "$2,888", "$2,925", "$2,959", "$3,001",
"$3,027", "$3,061", "$3,071"), class = "factor"), price_4_2 = structure(c(33L,
31L, 21L, 16L, 25L, 13L), .Label = c("$2,500", "$2,502",
"$2,503", "$2,505", "$2,506", "$2,511", "$2,513", "$2,516",
"$2,529", "$2,539", "$2,547", "$2,554", "$2,557", "$2,562",
"$2,567", "$2,579", "$2,581", "$2,583", "$2,585", "$2,591",
"$2,612", "$2,629", "$2,640", "$2,670", "$2,695", "$2,726",
"$2,737", "$2,788", "$2,790", "$2,798", "$2,852", "$3,031",
"$3,063"), class = "factor"), price_4_3 = structure(c(4L,
30L, 4L, 19L, 1L, 27L), .Label = c("$2,500", "$2,504", "$2,507",
"$2,509", "$2,511", "$2,512", "$2,514", "$2,516", "$2,543",
"$2,552", "$2,562", "$2,575", "$2,578", "$2,581", "$2,594",
"$2,614", "$2,615", "$2,617", "$2,636", "$2,640", "$2,641",
"$2,652", "$2,749", "$2,755", "$2,805", "$2,812", "$2,867",
"$2,906", "$2,910", "$2,917", "$2,924", "$2,927", "$2,961",
"$3,028", "$3,053", "$3,054"), class = "factor"), price_4_4 = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "$2,500", class = "factor"),
plan_1_1 = structure(c(2L, 2L, 2L, 1L, 1L, 2L), .Label = c("",
"X"), class = "factor"), plan_1_2 = structure(c(1L, 1L, 1L,
2L, 1L, 1L), .Label = c("", "X"), class = "factor"), plan_1_3 = structure(c(1L,
1L, 1L, 1L, 2L, 1L), .Label = c("", "X"), class = "factor"),
plan_1_4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"X"), class = "factor"), plan_2_1 = structure(c(1L, 2L, 1L,
2L, 2L, 2L), .Label = c("", "X"), class = "factor"), plan_2_2 = structure(c(1L,
1L, 2L, 1L, 1L, 1L), .Label = c("", "X"), class = "factor"),
plan_2_3 = structure(c(2L, 1L, 1L, 1L, 2L, 1L), .Label = c("",
"X"), class = "factor"), plan_2_4 = structure(c(1L, 1L, 1L,
1L, 1L, 1L), .Label = c("", "X"), class = "factor"), plan_3_1 = structure(c(1L,
2L, 1L, 1L, 2L, 1L), .Label = c("", "X"), class = "factor"),
plan_3_2 = structure(c(1L, 1L, 1L, 2L, 1L, 1L), .Label = c("",
"X"), class = "factor"), plan_3_3 = structure(c(2L, 1L, 1L,
1L, 1L, 2L), .Label = c("", "X"), class = "factor"), plan_3_4 = structure(c(1L,
1L, 2L, 1L, 1L, 1L), .Label = c("", "X"), class = "factor"),
plan_4_1 = structure(c(2L, 2L, 1L, 1L, 1L, 1L), .Label = c("",
"X"), class = "factor"), plan_4_2 = structure(c(2L, 1L, 1L,
2L, 1L, 1L), .Label = c("", "X"), class = "factor"), plan_4_3 = structure(c(1L,
1L, 1L, 1L, 2L, 2L), .Label = c("", "X"), class = "factor"),
plan_4_4 = structure(c(1L, 1L, 2L, 1L, 1L, 1L), .Label = c("",
"X"), class = "factor")), .Names = c("WorkerId", "pio_1_1",
"pio_1_2", "pio_1_3", "pio_1_4", "pio_2_1", "pio_2_2", "pio_2_3",
"pio_2_4", "pio_3_1", "pio_3_2", "pio_3_3", "pio_3_4", "pio_4_1",
"pio_4_2", "pio_4_3", "pio_4_4", "caremgmt_1_1", "caremgmt_1_2",
"caremgmt_1_3", "caremgmt_1_4", "caremgmt_2_1", "caremgmt_2_2",
"caremgmt_2_3", "caremgmt_2_4", "caremgmt_3_1", "caremgmt_3_2",
"caremgmt_3_3", "caremgmt_3_4", "caremgmt_4_1", "caremgmt_4_2",
"caremgmt_4_3", "caremgmt_4_4", "prev_1_1", "prev_1_2", "prev_1_3",
"prev_1_4", "prev_2_1", "prev_2_2", "prev_2_3", "prev_2_4", "prev_3_1",
"prev_3_2", "prev_3_3", "prev_3_4", "prev_4_1", "prev_4_2", "prev_4_3",
"prev_4_4", "price_1_1", "price_1_2", "price_1_3", "price_1_4",
"price_2_1", "price_2_2", "price_2_3", "price_2_4", "price_3_1",
"price_3_2", "price_3_3", "price_3_4", "price_4_1", "price_4_2",
"price_4_3", "price_4_4", "plan_1_1", "plan_1_2", "plan_1_3",
"plan_1_4", "plan_2_1", "plan_2_2", "plan_2_3", "plan_2_4", "plan_3_1",
"plan_3_2", "plan_3_3", "plan_3_4", "plan_4_1", "plan_4_2", "plan_4_3",
"plan_4_4"), row.names = c(NA, 6L), class = "data.frame")
... almost a year later...
This came to mind the other day, and I have a sneaking suspicion that it is what you tried to show in your example, but unfortunately, your example code doesn't run!
melt sometimes takes things a bit too far for me when making my data "long". Sometimes, even though it is not what would necessarily be called "tidy data", I prefer to have a "semi-long" data.frame. This is easily achieved using base R's reshape, but requires a few extra steps with the "reshape2" package, as demonstrated below:
Prerequisite: sample data.
set.seed(1)
myDf <- data.frame(
ID.1 = sample(letters[1:5], 5, replace = TRUE),
ID.2 = 1:5,
V.1 = sample(10:14, 5, replace = TRUE),
V.2 = sample(5:9, 5, replace = TRUE),
V.3 = sample(3:14, 5, replace = TRUE),
W.1 = sample(LETTERS, 5, replace = TRUE),
W.2 = sample(LETTERS, 5, replace = TRUE),
W.3 = sample(LETTERS, 5, replace = TRUE)
)
myDf
# ID.1 ID.2 V.1 V.2 V.3 W.1 W.2 W.3
# 1 b 1 14 6 8 Y K M
# 2 b 2 14 5 11 F A P
# 3 c 3 13 8 14 Q J M
# 4 e 4 13 6 7 D W E
# 5 b 5 10 8 12 G I V
The "semi-long" output that I'm looking for. Easily achieved with base R's reshape.
reshape(myDf, direction = "long", idvar=1:2, varying = 3:ncol(myDf))
# ID.1 ID.2 time V W
# b.1.1 b 1 1 14 Y
# b.2.1 b 2 1 14 F
# c.3.1 c 3 1 13 Q
# e.4.1 e 4 1 13 D
# b.5.1 b 5 1 10 G
# b.1.2 b 1 2 6 K
# b.2.2 b 2 2 5 A
# c.3.2 c 3 2 8 J
# e.4.2 e 4 2 6 W
# b.5.2 b 5 2 8 I
# b.1.3 b 1 3 8 M
# b.2.3 b 2 3 11 P
# c.3.3 c 3 3 14 M
# e.4.3 e 4 3 7 E
# b.5.3 b 5 3 12 V
melt is great if you wanted the equivalent of stack, especially since stack discards all factor variables, which is frustrating when read.table and family defaults to stringsAsFactors = TRUE. (You can make it work, but you need to convert the relevant columns to character before you can use stack). But, it is not what I'm looking for, in particular because of how it has handled the "variable" column.
library(reshape2)
myDfL <- melt(myDf, id.vars=1:2)
head(myDfL)
# ID.1 ID.2 variable value
# 1 b 1 V.1 14
# 2 b 2 V.1 14
# 3 c 3 V.1 13
# 4 e 4 V.1 13
# 5 b 5 V.1 10
# 6 b 1 V.2 6
To fix this, one needs to first split the "variable" column, and then use dcast to get the same format of output as you would get from reshape.
myDfL <- cbind(myDfL, colsplit(myDfL$variable, "\\.", names=c("var", "time")))
dcast(myDfL, ID.1 + ID.2 + time ~ var, value.var="value")
# ID.1 ID.2 time V W
# 1 b 1 1 14 Y
# 2 b 1 2 6 K
# 3 b 1 3 8 M
# 4 b 2 1 14 F
# 5 b 2 2 5 A
# 6 b 2 3 11 P
# 7 b 5 1 10 G
# 8 b 5 2 8 I
# 9 b 5 3 12 V
# 10 c 3 1 13 Q
# 11 c 3 2 8 J
# 12 c 3 3 14 M
# 13 e 4 1 13 D
# 14 e 4 2 6 W
# 15 e 4 3 7 E

Resources