I have data like the following, in which the column names combine a place (A*) with a time-format code (01:01), and the row names such as 1C or 9D represent the individual IDs.
structure(list(V1 = c("1C", "9D", "9F", "9H", "9S", "9T", "9Y"
), `A*01:01` = c(NA, NA, "1", NA, NA, NA, NA), `A*02:01` = c(NA,
NA, "1", NA, NA, NA, NA), `A*02:02` = c(NA, NA, "1", NA, NA,
NA, NA), `A*02:03` = c(NA, NA, "1", NA, NA, NA, NA), `A*02:05` = c(NA,
NA, NA, NA, NA, NA, "1"), `A*02:06` = c(NA, NA, NA, NA, NA, NA,
"1"), `A*03:01` = c(NA, NA, "1", NA, NA, NA, NA), `A*11:01` = c(NA,
NA, NA, NA, NA, NA, "1"), `A*11:02` = c(NA, NA, NA, NA, NA, NA,
"1"), `A*23:01` = c(NA, NA, NA, NA, "1", NA, NA), `A*23:02` = c(NA,
NA, NA, NA, "1", NA, NA), `A*24:02` = c(NA, NA, NA, NA, "1",
NA, NA), `A*24:03` = c(NA, NA, NA, NA, "1", NA, NA), `A*25:01` = c(NA,
NA, NA, NA, NA, NA, "1"), `A*26:01` = c(NA, NA, NA, NA, NA, NA,
"1"), `A*29:01` = c(NA, NA, NA, NA, NA, "1", NA), `A*29:02` = c(NA,
NA, NA, NA, NA, "1", NA), `A*30:01` = c(NA, NA, NA, NA, "1",
NA, NA), `A*30:02` = c(NA, NA, NA, NA, "1", NA, NA), `A*31:01` = c(NA,
NA, NA, NA, NA, "1", NA), `A*32:01` = c(NA, NA, "1", NA, NA,
NA, NA), `A*33:01` = c(NA, NA, NA, NA, NA, "1", NA), `A*33:03` = c(NA,
NA, NA, NA, NA, "1", NA), `A*34:01` = c(NA, NA, NA, NA, NA, NA,
"1"), `A*34:02` = c(NA, NA, NA, NA, NA, NA, "1"), `A*36:01` = c(NA,
NA, "1", NA, NA, NA, NA), `A*43:01` = c(NA, NA, NA, NA, NA, NA,
"1"), `A*66:01` = c(NA, NA, NA, NA, NA, NA, "1"), `A*66:02` = c(NA,
NA, NA, NA, NA, NA, "1"), `A*68:01` = c(NA, NA, NA, NA, NA, NA,
"1"), `A*68:02` = c(NA, NA, NA, NA, NA, NA, "1"), `A*69:01` = c(NA,
NA, NA, NA, NA, NA, "1"), `A*74:01` = c(NA, NA, "1", NA, NA,
NA, NA), `A*80:01` = c(NA, NA, "1", NA, NA, NA, NA), `B*07:02` = c(NA,
NA, NA, NA, NA, NA, "1"), `B*07:03` = c(NA, NA, NA, NA, NA, NA,
"1"), `B*08:01` = c(NA, "1", NA, NA, NA, NA, NA), `B*13:01` = c(NA,
NA, NA, NA, NA, NA, "1"), `B*13:02` = c(NA, NA, NA, NA, NA, NA,
"1"), `B*14:01` = c(NA, NA, NA, NA, NA, NA, "1"), `B*14:02` = c(NA,
NA, NA, NA, NA, NA, "1"), `B*14:05` = c(NA, NA, NA, NA, NA, NA,
"1"), `B*14:06` = c(NA, NA, NA, NA, NA, NA, "1"), `B*15:01` = c(NA,
NA, NA, NA, NA, NA, "1"), `B*15:02` = c(NA, NA, NA, NA, NA, NA,
"1"), `B*15:03` = c(NA, NA, NA, NA, NA, NA, "1"), `B*15:10` = c(NA,
NA, NA, NA, NA, NA, "1"), `B*15:11` = c(NA, NA, NA, NA, NA, NA,
"1"), `B*15:12` = c(NA, NA, NA, NA, NA, NA, "1"), `B*15:13` = c(NA,
NA, NA, NA, NA, NA, "1"), `B*15:16` = c(NA, NA, NA, NA, NA, NA,
"1"), `B*15:18` = c(NA, NA, NA, NA, NA, NA, "1"), `B*18:01` = c(NA,
NA, NA, "1", NA, NA, NA), `B*27:03` = c(NA, NA, NA, "1", NA,
NA, NA), `B*27:05` = c(NA, NA, NA, "1", NA, NA, NA), `B*27:08` = c(NA,
NA, NA, "1", NA, NA, NA), `B*35:01` = c(NA, NA, NA, NA, NA, NA,
"1"), `B*35:08` = c(NA, NA, NA, NA, NA, NA, "1"), `B*37:01` = c(NA,
NA, NA, "1", NA, NA, NA), `B*38:01` = c(NA, NA, NA, NA, NA, NA,
"1"), `B*39:01` = c(NA, NA, NA, NA, NA, NA, "1"), `B*39:05` = c(NA,
NA, NA, NA, NA, NA, "1"), `B*40:01` = c(NA, NA, NA, "1", NA,
NA, NA), `B*40:02` = c(NA, NA, NA, "1", NA, NA, NA), `B*40:05` = c(NA,
NA, NA, "1", NA, NA, NA), `B*40:06` = c(NA, NA, NA, "1", NA,
NA, NA), `B*41:01` = c(NA, NA, NA, "1", NA, NA, NA), `B*41:02` = c(NA,
NA, NA, "1", NA, NA, NA), `B*42:01` = c(NA, NA, NA, NA, NA, NA,
"1"), `B*44:02` = c(NA, NA, NA, NA, NA, NA, "1"), `B*44:03` = c(NA,
NA, NA, NA, NA, NA, "1"), `B*45:01` = c(NA, NA, NA, "1", NA,
NA, NA), `B*46:01` = c(NA, NA, NA, NA, NA, NA, "1"), `B*47:01` = c(NA,
NA, NA, NA, NA, NA, "1"), `B*48:01` = c(NA, NA, NA, NA, NA, NA,
"1"), `B*49:01` = c(NA, NA, NA, "1", NA, NA, NA), `B*50:01` = c(NA,
NA, NA, "1", NA, NA, NA), `B*51:01` = c(NA, NA, NA, NA, NA, NA,
"1"), `B*51:02` = c(NA, NA, NA, NA, NA, NA, "1"), `B*52:01` = c(NA,
NA, NA, NA, NA, NA, "1"), `B*53:01` = c(NA, NA, NA, NA, NA, NA,
"1"), `B*54:01` = c(NA, NA, NA, NA, NA, NA, "1"), `B*55:01` = c(NA,
NA, NA, NA, NA, NA, "1"), `B*56:01` = c(NA, NA, NA, NA, NA, NA,
"1"), `B*57:01` = c(NA, NA, NA, NA, NA, NA, "1"), `B*57:03` = c(NA,
NA, NA, NA, NA, NA, "1"), `B*58:01` = c(NA, NA, NA, NA, NA, NA,
"1"), `B*59:01` = c(NA, NA, NA, NA, NA, NA, "1"), `B*67:01` = c(NA,
NA, NA, NA, NA, NA, "1"), `B*73:01` = c(NA, NA, NA, "1", NA,
NA, NA), `B*78:01` = c(NA, NA, NA, NA, NA, NA, "1"), `B*81:01` = c(NA,
NA, NA, NA, NA, NA, "1"), `B*82:01` = c(NA, NA, NA, NA, NA, NA,
"1"), `B*82:02` = c(NA, NA, NA, NA, NA, NA, "1"), `C*01:02` = c("1",
NA, "1", NA, NA, NA, NA), `C*02:02` = c("1", NA, NA, NA, NA,
NA, "1"), `C*02:10` = c("1", NA, NA, NA, NA, NA, "1"), `C*03:02` = c(NA,
NA, NA, NA, NA, NA, "1"), `C*03:03` = c(NA, NA, NA, NA, NA, NA,
"1"), `C*03:04` = c(NA, NA, NA, NA, NA, NA, "1"), `C*04:01` = c(NA,
NA, NA, NA, "1", NA, NA), `C*04:03` = c(NA, NA, NA, NA, NA, NA,
"1"), `C*05:01` = c("1", NA, NA, NA, NA, NA, "1"), `C*06:02` = c("1",
"1", NA, NA, NA, NA, NA), `C*07:01` = c("1", "1", NA, NA, NA,
NA, NA), `C*07:02` = c("1", "1", NA, NA, NA, NA, NA), `C*07:04` = c("1",
"1", NA, NA, NA, NA, NA), `C*08:01` = c("1", NA, NA, NA, NA,
NA, "1"), `C*08:02` = c("1", NA, NA, NA, NA, NA, "1"), `C*12:02` = c("1",
NA, NA, NA, NA, NA, "1"), `C*12:03` = c("1", NA, NA, NA, NA,
NA, "1"), `C*14:02` = c("1", NA, NA, NA, "1", NA, NA), `C*15:02` = c("1",
NA, NA, NA, NA, NA, "1"), `C*16:01` = c("1", NA, NA, NA, NA,
NA, "1"), `C*17:01` = c(NA, NA, NA, NA, NA, NA, "1"), `C*18:01` = c("1",
"1", NA, NA, NA, NA, NA), `C*18:02` = c("1", "1", NA, NA, NA,
NA, NA)), row.names = c("1C", "9D", "9F", "9H", "9S", "9T", "9Y"
), class = "data.frame")
I am using the following code to process the data, which is working fine for column 2. But I have 16000 columns and 400 rows in my real data. I want to use "for loop" or "sapply" in R to perform processing. A quick solution is much appreciated.
# Collapse the IDs (column 1) of every row that has a value in one target
# column into two "&"-terminated strings: the letters of each ID ("res") and
# the digits of each ID ("pos"). Change `col_idx` to process another column.
col_idx <- 2

# IDs of the rows with a non-missing entry in the target column.
has_value <- !is.na(dat[[col_idx]])
LA <- dat[[1]][has_value]

# Split each ID into its non-digit part and its non-letter part.
res <- gsub("[[:digit:]]", "", LA)
pos <- gsub("[[:alpha:]]", "", LA)

# sprintf() returns character(0) when no row matched, so an all-NA column
# yields two empty strings instead of raising an error.
LA_sep <- c(
  res = paste(sprintf("%s&", res), collapse = ""),
  pos = paste(sprintf("%s&", pos), collapse = "")
)

# The output file is named after the index of the processed column.
write.csv(LA_sep, paste0(col_idx, ".csv"))
When I run the above code on the big data, I get the following output for the 2nd column (A*01:01). First, I want to extract the values of column V1 for the rows in which column 2, 3, 4, and so on has the value 1. Then I want to split the digits and the characters of those values and save them as "res" and "pos", separated by "&", for each column (A*01:01, A*02:01, A*02:02, ...).
res F&K&M&Q&E&R&A&R&N&A&N&N&H&N&M&H&S&A&N&T&N&N&G&T&G&T&T&TL&D&I&I&Y&F&R&D&M&I&K&K&R&A&H&H&A&H&A&R&R&R&G&D&G&P&I&E&L
pos 9&44&45&62&63&65&69&65&66&69&66&66&70&66&67&70&71&76&77&80&77&77&79&80&79&80&80&80&81&90&95&97&99&109&114&116&138&142&144&144&145&149&151&151&152&151&152&156&163&163&167&166&167&193&194&275&276
So it is very difficult to do this manually for each column, or without a loop, on big data. The output for each column should be saved as a separate file named after that column.
Related
I am importing multiple excel files. The files have a non-standard structure, but all have the required data after a row of headers, midway down the rows of the data frame.
Here is a MWE:
df= structure(list(...1 = c("CPET Results", NA, "Operator", NA, NA,
"Patient data", NA, "Administrative Data", "ID", "Title", "Last Name",
"First Name", "Name Addition", "Sex", "Date of Birth", NA, NA,
"Biological and Medical Baseline Data", "Height", "Weight", "Mask",
"Race", "Body Fat", "Hip/Waist Ratio", "BMI", "Estimated Fitness Level",
"BSA", "Hct", "Hb", "Medication that changes the Heart Rate",
"Medication", "Existing Medical Conditions", NA, NA, NA, "Test data",
"Start Time", "Duration", "CPET device", "Serial number", "Firmware version",
"Flow Sensor", "Temperature", "Barometric Pressure", "Humidity",
NA, NA, NA, "Variable", "V'O2", "V'CO2", "V'O2/kg", "V'O2/HR",
"HR", "V'E/V'O2", "V'E/V'CO2", "V'E", "BF", "RER", "WR", NA,
NA, "t", "h:mm:ss.ms", "0:00:25.000", "0:00:26.000", "0:00:27.000",
"0:00:28.000", "0:00:29.000", "0:00:30.000"), ...2 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Unit",
"L/min", "L/min", "ml/min/kg", "ml", "/min", NA, NA, "L/min",
"/min", NA, "W", NA, NA, "Phase", NA, "Rest", "Rest", "Rest",
"Rest", "Rest", "Rest"), ...3 = c(NA, NA, NA, NA, NA, NA, NA,
NA, "343", NA, "GFRex", "343", NA, "female", "21/05/1924", NA,
NA, NA, "178 cm", "88.2 kg", "Blue, medium", NA, NA, "0.96",
"28", NA, "2.06 m2", NA, NA, NA, NA, NA, NA, NA, NA, NA, "12/04/2021 11:27 AM",
"0:15:12", "MetaLyzer 3B-R3", "231821624", "1.3.10", NA, "21.5°C",
"1030mBar", "36%", NA, NA, NA, "Rest", "0.36", "0.31", "4", "0",
"-", "35.7", "40.0", "14.9", "14", "0.88", "0", NA, NA, "Marker",
NA, NA, NA, NA, NA, NA, NA), ...4 = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Unloaded Pedalling",
"-", "-", "-", "-", "-", "-", "-", "-", "-", "-", "-", NA, NA,
"V'O2", "L/min", "0.61123179277253403", "0.61123179277253403",
"0.61123179277253403", "0.61123179277253403", "0.51731964113453299",
"0.51731964113453299"), ...5 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, "Warm Up", "0.61", "0.47",
"7", "0", "-", "26.2", "33.9", "18.5", "16", "0.77", "0", NA,
NA, "V'O2/kg", "ml/min/kg", "6.9339965147196203", "6.9339965147196203",
"6.9339965147196203", "6.9339965147196203", "5.8686289408341796",
"5.8686289408341796"), ...6 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "VT1", "1.22", "0.98", "14",
"-", "-", "28.6", "32.3", "35.4", "22", "0.88", "71", NA, NA,
"V'O2/HR", "ml", "0", "0", "0", "0", "0", "0"), ...7 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "VT1 % Norm",
"145", "-", "145", "-", "-", "-", "-", "-", "102", "-", "131",
NA, NA, "HR", "/min", NA, NA, NA, NA, NA, NA), ...8 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "VT1 % Max",
"69", "55", "69", "-", "-", "83", "96", "54", "76", "87", "55",
NA, NA, "WR", "W", "0", "0", "0", "0", "0", "0"), ...9 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "VT2",
"1.71", "1.66", "19", "-", "-", "31.9", "32.7", "60.0", "32",
"0.97", "122", NA, NA, "V'E/V'O2", NA, "30.6521809263484", "30.6521809263484",
"30.6521809263484", "30.6521809263484", "34.760039405568897",
"34.760039405568897"), ...10 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, "VT2 % Norm", "203", "-",
"203", "-", "-", "-", "-", "-", "147", "-", "226", NA, NA, "V'E/V'CO2",
NA, "35.697970640705897", "35.697970640705897", "35.697970640705897",
"35.697970640705897", "39.618822090063901", "39.618822090063901"
), ...11 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "VT2 % Max", "97", "93", "97", "-", "-", "93",
"97", "92", "110", "96", "96", NA, NA, "RER", NA, "0.858653317715381",
"0.858653317715381", "0.858653317715381", "0.858653317715381",
"0.87736175817015105", "0.87736175817015105"), ...12 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "V'O2peak",
"1.77", "1.79", "20", "0", "-", "34.3", "33.8", "65.4", "29",
"1.01", "128", NA, NA, "V'E", "L/min", "23.334937499999999",
"23.334937499999999", "23.334937499999999", "23.334937499999999",
"21.762284444444401", "21.762284444444401"), ...13 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "V'O2peak % Norm",
"210", "-", "210", "0", "-", "-", "-", "-", "135", "-", "237",
NA, NA, "VT", "L", "0.86250000000000004", "0.86250000000000004",
"0.86250000000000004", "0.86250000000000004", "0.97866666666666702",
"0.97866666666666702"), ...14 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, "Normal", "0.84", "-", "10",
"8", "104", "-", "-", "-", "22", "-", "54", NA, NA, "BF", "/min",
"27.055", "27.055", "27.055", "27.055", "22.2366666666667", "22.2366666666667"
), ...15 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "Absolute Maximum Values", "2.06", "1.96", "23",
"0", "-", "44.5", "37.2", "73.5", "34", "1.19", "128", NA, NA,
"V'CO2", "L/min", "0.52483620675725695", "0.52483620675725695",
"0.52483620675725695", "0.52483620675725695", "0.45387646988174501",
"0.45387646988174501"), ...16 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, "WR", "W", "0", "0", "0", "0", "0",
"0")), row.names = c(NA, -70L), class = c("tbl_df", "tbl", "data.frame"
))
I want to remove all rows before the row where ...1 == "t". I'm importing multiple files and want to do this to all of them at the same time, and the header "t" appears at a different row number in each file.
I have tried
# Attempt 1: row_number() ranks every element of the logical vector instead of
# returning the position of the matching row, so `1:...` is not given a single
# end index.
df1 = df[-c(1:row_number(df$...1 =="t")),]
# Attempt 2: rownames() gives character strings, and the unary minus used to
# drop rows is not defined for character values.
df1 = df[-c(rownames(df[df$...1 =="t",])),]
I'd like a base R or dplyr solution. Thanks
In dplyr, the slice function can be used to select rows by index, and the base-R which() can tell you which row index to start at.
# Keep every row from the first "t" header down to the last row of the table.
df %>%
  slice(seq(from = min(which(...1 == "t")), to = n()))
This code will check for any rows on which ...1 == 't', then which() tells you the row index. min() is in case you get a file with two rows of 't'. Then, the slice picks all rows from that row you just found to the end (n()).
in base R you could do:
# Keep the first row whose ...1 is "t" and everything after it. match() gives
# the first hit only, so a repeated "t" or a "t" in row 1 is handled; when no
# row matches, match() returns NA and seq() stops with an error instead of
# silently returning an empty table.
df[seq(match("t", df[["...1"]]), nrow(df)), ]
I have a table that has for each survey site and survey date, a total of the number organisms counted, and measurements for each organism found. I would like to make sure that the data is correct by making sure the total organism counted match the total number of measurements taken.
I initially tried to gather the table, change the values to 1 or 0 depending on whether a measurement was taken, and then group_by and summarise. This method didn't work, and I am sure there is a nicer method, so any help would be appreciated.
Ideally I would like a table that has site, survey date, total counts and a count column derived from summing the number of measurements taken. The idea is that the two count columns should have the same values, showing that no data are missing.
Sample data -
structure(list(Date.of.Survey = c("12/04/2022", "16/04/2022",
"12/04/2022", "13/04/2022", "14/04/2022", "15/04/2022"), Location = c("Wandle - Merton Abbey Mills",
"Wandle - Merton Abbey Mills", "Medway - Allington Weir", "Medway - Allington Weir",
"Medway - Allington Weir", "Medway - Allington Weir"), Was.the.trap.working.when.you.checked.it. = c("Yes",
"Yes", "Yes", "Yes", "Yes", "Yes"), Number.of.eels = c(0L, 1L,
0L, 0L, 0L, 20L), X1..Length..mm. = c("", "180", "", "", "",
"72"), X2..Length..mm. = c("", "", "", "", "", "69"), X3..Length..mm. = c("",
"", "", "", "", "76"), X4..Length..mm. = c("", "", "", "", "",
"72"), X5..Length..mm. = c("", "", "", "", "", "72"), X6..Length..mm. = c("",
"", "", "", "", "73"), X7..Length..mm. = c(NA, NA, NA, NA, NA,
77L), X8..Length..mm. = c(NA, NA, NA, NA, NA, 78L), X9..Length..mm. = c(NA,
NA, NA, NA, NA, 75L), X10..Length..mm. = c(NA, NA, NA, NA, NA,
72L), X11..Length..mm. = c(NA, NA, NA, NA, NA, 75L), X12..Length..mm. = c(NA,
NA, NA, NA, NA, 78L), X13..Length..mm. = c(NA, NA, NA, NA, NA,
74L), X14..Length..mm. = c(NA, NA, NA, NA, NA, 70L), X15..Length..mm. = c(NA,
NA, NA, NA, NA, 75L), X16..Length..mm. = c(NA, NA, NA, NA, NA,
75L), X17..Length..mm. = c(NA, NA, NA, NA, NA, 73L), X18..Length..mm. = c(NA,
NA, NA, NA, NA, 72L), X19..Length..mm. = c(NA, NA, NA, NA, NA,
75L), X20..Length..mm. = c(NA, NA, NA, NA, NA, 71L), X21..Length..mm. = c(NA,
NA, NA, NA, NA, NA), X22..Length..mm. = c(NA, NA, NA, NA, NA,
NA), X23..Length..mm. = c(NA, NA, NA, NA, NA, NA), X24..Length..mm. = c(NA,
NA, NA, NA, NA, NA), X25..Length..mm. = c(NA, NA, NA, NA, NA,
NA), X26..Length..mm. = c(NA, NA, NA, NA, NA, NA), X27..Length..mm. = c(NA,
NA, NA, NA, NA, NA), X28..Length..mm. = c(NA, NA, NA, NA, NA,
NA), X29..Length..mm. = c(NA, NA, NA, NA, NA, NA), X30..Length..mm. = c(NA,
NA, NA, NA, NA, NA), X31..Length..mm. = c(NA, NA, NA, NA, NA,
NA), X32..Length..mm. = c(NA, NA, NA, NA, NA, NA), X33..Length..mm. = c(NA,
NA, NA, NA, NA, NA), X34..Length..mm. = c(NA, NA, NA, NA, NA,
NA), X35..Length..mm. = c(NA, NA, NA, NA, NA, NA), X36..Length..mm. = c(NA,
NA, NA, NA, NA, NA), X37..Length..mm. = c(NA, NA, NA, NA, NA,
NA), X38..Length..mm. = c(NA, NA, NA, NA, NA, NA), X39..Length..mm. = c(NA,
NA, NA, NA, NA, NA), X40..Length..mm. = c(NA, NA, NA, NA, NA,
NA), X41..Length..mm. = c(NA, NA, NA, NA, NA, NA), X42..Length..mm. = c(NA,
NA, NA, NA, NA, NA), X43..Length..mm. = c(NA, NA, NA, NA, NA,
NA), X44..Length..mm. = c(NA, NA, NA, NA, NA, NA), X45..Length..mm. = c(NA,
NA, NA, NA, NA, NA), X46..Length..mm. = c(NA, NA, NA, NA, NA,
NA), X47..Length..mm. = c(NA, NA, NA, NA, NA, NA), X48..Length..mm. = c(NA,
NA, NA, NA, NA, NA), X49..Length..mm. = c(NA, NA, NA, NA, NA,
NA), X50..Length..mm. = c(NA, NA, NA, NA, NA, NA)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
Thanks in advance
You want to first make sure that your blanks are NAs. Then you can use rowSums to count the number of non-NA columns, and finally use case_when to create a variable to identify whether the count matches the number of measurements. I also recommend using janitor's clean_names function to make it a little easier to work with your variable names.
library(dplyr)
library(janitor)
# Count the recorded length measurements per survey row and flag whether that
# count agrees with the reported number of eels.
df <- df %>%
  # Standardise the column names (e.g. Number.of.eels -> number_of_eels); the
  # steps below refer to the cleaned names.
  clean_names() %>%
  # Blank strings stand for "no measurement". Only character columns can hold
  # them, so the integer and logical length columns are left untouched.
  mutate(across(where(is.character), ~ na_if(.x, ""))) %>%
  # Pick the length columns by name so that all of them are counted, wherever
  # they sit in the table.
  mutate(count = rowSums(!is.na(select(., matches("_length_mm$"))))) %>%
  # 1 when the measurement count equals the reported total, otherwise 0.
  mutate(count_match = case_when(number_of_eels == count ~ 1,
                                 TRUE ~ 0))
I have a dataset that captures the list of variables in each of several data sets. It looks like this:
It can be built using this code:
df<-structure(list(cxr.CSV = c("project", "Subject", "Site", "InstanceName",
"RecordPosition", "CXRDT", "CXRFIND", "CXRFNDSP", "CXRYN", NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), cy1.CSV = c("project",
"Subject", "Site", "InstanceName", "RecordPosition", "CYSHPYN",
"CYSHPDT", "CY1TMPT", "CYND", "CYNDSP", "CYDT", "CYTM", NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA), cy2.CSV = c("project", "Subject",
"Site", "InstanceName", "RecordPosition", "CYSHPYN", "CYSHPDT",
"CY2TMPT", "CYND", "CYNDSP", "CYDT", "CYTM", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), cy24.CSV = c("project", "Subject", "Site",
"InstanceName", "RecordPosition", "CYSHPYN", "CYSHPDT", "CY1TMPT",
"CYND", "CYNDSP", "CYDT", "CYTM", NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), cy3.CSV = c("project", "Subject", "Site", "InstanceName",
"RecordPosition", "CYSHPYN", "CYSHPDT", "CY3TMPT", "CYND", "CYNDSP",
"CYDT", "CYTM", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), cy6.CSV = c("project",
"Subject", "Site", "InstanceName", "RecordPosition", "CYSHPYN",
"CYSHPDT", "CY1TMPT", "CYND", "CYNDSP", "CYDT", "CYTM", NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA), dlt.CSV = c("project", "Subject",
"Site", "InstanceName", "RecordPosition", "DLTYN", "DLTAE", "DLTSP",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), dm.CSV = c("project",
"Subject", "Site", "InstanceName", "RecordPosition", "BRTHYR",
"DMAGE", "SEX", "SEXSP", "FEMCBP", "FEMCBPSP", "RACE", "RACESP",
"ETHNIC", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), dov.CSV = c("project",
"Subject", "Site", "InstanceName", "RecordPosition", "DOVDT",
"DOVAE", "DOVCM", "DOVCP", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), dov_1.CSV = c("project", "Subject", "Site", "InstanceName",
"RecordPosition", "DOVDT", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), ds.CSV = c("project", "Subject", "Site",
"InstanceName", "RecordPosition", "DSDT", "DSREAS", "DSORTH",
"DSWCSP", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
ds_1.CSV = c("project", "Subject", "Site", "InstanceName",
"RecordPosition", "DSDT", "DSREAS", "DSWCSP", "DSORTH", NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), dth.CSV = c("project",
"Subject", "Site", "InstanceName", "RecordPosition", "DTHFCDT",
"DTHDT", "DTHDUR", "DTHREAS", "DTHROTH", "DTHCOMM", NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), dv.CSV = c("project",
"Subject", "Site", "InstanceName", "RecordPosition", "DVYN",
"DVVIS", "DVIDDAT", "DVSTDAT", "DVENDAT", "DVCAT", "DVCATSP",
"DVCATCD", "DVTERM", "REWFLAG", "REWCOMP", "DVACN", "DVMETRPT",
"DVCLSDAT", "DVCLS", NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), tegu.CSV = c("project",
"Subject", "Site", "InstanceName", "RecordPosition", "EGYN",
"EGDT", "EGNOU", "EGTM", "EGORRES", "EGHR", "EGPR", "EGQRS",
"EGQTINT", "ECGRR", "EGQTCFC", "EGQTCBC", "EGQTCNS", "EGQTCO",
"EGQTCOSP", "EGRSAB01", "EGRSAB02", "EGRSAB03", "EGRSAB04",
"EGRSAB05", "EGRSAB06", "EGRSAB07", "EGRSAB08", "EGRSAB09",
"EGRSAB10", "EGRSAB11", "EGRSAB12", "EGRSAB13", "EGABNCOM",
"EGABNCS", "EGTMPT", "EGND"), tegu_1.CSV = c("project", "Subject",
"Site", "InstanceName", "RecordPosition", "EGYN", "EGNOU",
"EGND", "EGTMPT", "EGDT", "EGTM", "EGORRES", "EGHR", "EGPR",
"EGQRS", "EGQTINT", "ECGRR", "EGQTCFC", "EGQTCBC", "EGQTCNS",
"EGQTCO", "EGQTCOSP", "EGRSAB01", "EGRSAB02", "EGRSAB03",
"EGRSAB04", "EGRSAB05", "EGRSAB06", "EGRSAB07", "EGRSAB08",
"EGRSAB09", "EGRSAB10", "EGRSAB11", "EGRSAB12", "EGRSAB13",
"EGABNCOM", "EGABNCS")), row.names = c(NA, -37L), class = c("tbl_df",
"tbl", "data.frame"))
I want to compare the columns with each other. If two data sets have the same variables, or one is completely included in the other, then mark them with the same number. In the end, I would like to get a summary table that looks like this:
It does not need to be exactly the same as long as it captures the information. The tricky parts are: tegu.CSV and tegu_1.CSV, and ds.CSV and ds_1.CSV, have the same variable lists in a different order, and dov.CSV has every variable that dov_1.CSV has and more. They need to be in the same group.
How can I achieve this goal?
Additional step: what if I only want data sets that have exactly the same variables to share a group? In that case, would dov and dov_1 be in separate groups?
Here is one solution, although not nice it might help you:
library(purrr)
# NOTE(review): tibble() and bind_rows() used at the end are not purrr
# functions; presumably dplyr/tibble is attached elsewhere — confirm.
# Strip the NA padding so each column is reduced to its own variable names.
my_data <- df %>%
map(~.x[!is.na(.x)])
# Element-wise setdiff over two lists of equal length; used as FUN in outer().
mySetDiff <- function(a, b) map2(a, b, setdiff)
# Compare every column with every column. An empty setdiff(row, col) means all
# of the row column's variables appear in the other column, so each entry ends
# up holding the names of the columns that contain it. Entries are then sorted
# so that those contained in the most columns come first.
my_data <- my_data %>%
outer(., ., mySetDiff) %>%
apply(1, function(x) colnames(df)[which(map_dbl(x, length) == 0)]) %>%
.[order(map_dbl(., length), decreasing = TRUE)]
# Build the groups: repeatedly take the first remaining entry, pool the
# containing-column names of every column it lists, and remove the listed
# columns from the pool until nothing is left.
i <- 1
my_list <- list()
repeat{
if(length(my_data) == 0) break
my_list[[i]] <- my_data[my_data[[1]]] %>%
unlist() %>%
unique()
my_data <- my_data[-which(names(my_data) %in% my_data[[1]])]
i <- i + 1
}
# One row per group member, with Group set to the group's position in my_list.
my_list %>%
imap(~tibble(Data = .x, Group = .y)) %>%
bind_rows()
just note that cy2.csv and cy3.csv have CY2TMPT/CY3TMPT so they should not be in same group as cy1.csv, cy6.csv, cy24.csv
I am struggling with cellwise-calculations in a complex data-set (see below for dput() example).
I need to apply the formula for the standardized mean difference, $(M_1 - M_2)/\sqrt{s_1^2 + s_2^2}$, to multiple rows and columns (studies and tests). The M1 & M2 (means) values are in the pr_cognm_ columns and s1 and s2 (standard deviations) in the pr_cognsd_ columns, and they should be calculated dependent on the factors id & tx...3 (treatments).
So e.g. for the pr_cognm_VV2_CRT_error column the id 336 has two rows and, in this case!, the values in the treat1 row need to be subtracted from VGT. But sometimes it is the other way around, e.g. within id 162 dGT needs to be subtracted from treat1 (luckily, this logic will be the same for each specific comparison though). Then, the same thing needs to happen with the standard deviations (i.e. potentiate and addition). Lastly, M and S need to be divided. There are many tests (columns) to run the formula on (e.g.pr_cognm_BVMT_perc_retention, pr_cognm_VV2_CRT_error, etc.) and often NA, since the specific id did not have this test. The data is in the long format and, to make it more complicated, some id have three instead of two rows (where two rows need to be subtracted from one other in a specific direction e.g. task1).
My best idea was to
# Build the working data set: drop the rows whose id contains "com" and keep
# the treatment, the id and every mean (pr_cognm) and standard-deviation
# (pr_cognsd) column.
a <- readxl::read_excel("C:/.../reprod.xlsx")
b <- a[!grepl("com", a$id), ] # already omitted in example dataset
pr_cognm <- dplyr::select(b, contains("pr_cognm"))
pr_cognsd <- dplyr::select(b, contains("pr_cognsd"))
c <- cbind(b$tx...3, b$id, pr_cognm, pr_cognsd)
# Turn the id into a factor and all test columns into numerics.
c$`b$id` <- as.factor(c$`b$id`)
# lapply() returns one numeric vector per column, which is the type-stable way
# to replace a set of data-frame columns.
c[, 3:ncol(c)] <- lapply(c[, 3:ncol(c)], as.numeric)
# Square all standard deviations (s1^2 and s2^2).
sd_cols <- grepl("pr_cognsd", colnames(c))
c[, sd_cols] <- c[, sd_cols]^2
# Then reshape to one row per id and one column per treatment.
# library() stops with an error if reshape2 is missing, whereas require() only
# returns FALSE and lets the script fail later.
library(reshape2)
c %>%
  dcast(b$id ~ b$tx...3, value.var = c("pr_cognm_VV2_CRT_error"), fill = 0)
b$id BF BL BT dGT H-TT HFL LM-TT treat1 VGT
1 55 0 0 0 0 0 0.00 0.00 0.00 0.00
2 162 0 0 0 0 0 0.00 0.00 0.00 0.00
3 236 0 0 0 0 0 0.00 0.00 0.00 0.00
4 336 0 0 0 0 0 0.00 0.00 8.75 7.58
5 377 0 0 0 0 0 0.00 0.00 0.00 0.00
6 521 0 0 0 0 0 0.00 0.00 0.00 0.00
7 525 0 0 0 0 0 0.00 0.00 0.00 0.00
8 527 0 0 0 0 0 0.00 0.00 0.00 0.00
9 528 0 0 0 0 0 0.00 0.00 0.00 0.00
10 535 0 0 0 0 0 5.65 6.54 0.00 0.00
11 548 0 0 0 0 0 0.00 0.00 0.00 0.00
12 553 0 0 0 0 0 0.00 0.00 0.00 0.00
Now I could define the rules for which variables should be subtracted, like c$sub <- c$treat1 - c$VGT and c$sub <- c$HFL - c$`LM-TT`, combine the SDs in a similar fashion, and finally divide the two variables to find the SMD. BUT this only works for one test, in this case value.var = c("pr_cognm_VV2_CRT_error"). I would like to get this matrix for every test I have in the dataset, e.g. via a loop, since more than one value.var doesn't work:
# Attempt to cast two tests in a single call. This fails: the warning printed
# below comes from a scalar `if (!(value.var %in% names(data)))` check, so
# value.var is expected to be one column name, not a vector of names.
# NOTE(review): pr_cognm_BNT_perc_retention is not among the columns of the
# truncated example data (it has pr_cognm_BNT_naming and
# pr_cognm_BVMT_perc_retention) - confirm the name exists in the full data.
require(reshape2)
c %>%
dcast(b$id ~ b$tx...3, value.var = c("pr_cognm_VV2_CRT_error", "pr_cognm_BNT_perc_retention"), fill = 0)
Error in .subset2(x, i, exact = exact) : subscript out of bounds
In addition: Warning message:
In if (!(value.var %in% names(data))) { :
the condition has length > 1 and only the first element will be used
If there were a way to loop via
# Cast every test column in turn. dcast() is called once per column because
# passing several names to value.var at once fails (see the error above).
# `casted` is a list of wide id-by-treatment tables, one per test column and
# named after that column, ready to be combined or post-processed.
value_cols <- names(c)[3:ncol(c)]
casted <- lapply(value_cols, function(v) {
  dcast(c, b$id ~ b$tx...3, value.var = v, fill = 0)
})
names(casted) <- value_cols
then I could maybe rbind them and do the subtractions into a new variable as described above and after doing the same for the SD's I would finally be able to do the division to get the SMD.
I could not get any solutions to work.
Reprod example (truncated):
a <- structure(list(checked = c("Y", "Y", NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), id = c("55", "55", "162", "162", "236", "236", "336", "336",
"377", "377", "521", "521", "525", "525", "527", "527", "528",
"528", "535", "535", "548", "548", "548", "553", "553"), tx...3 = c("task1",
"VGT", "dGT", "task1", "BT", "H-TT", "task1", "VGT", "BT", "H-TT",
"task1", "VGT", "HFL", "H-TT", "BF", "BT", "HFL", "task1", "HFL",
"LM-TT", "HFL", "BL", "task1", "HFL", "task1"), nta = c(2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3,
2, 2), id2 = c(1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
1, 2, 1, 2, 1, 2, 3, 1, 2), cross = c("N", "N", "N", "N", "N",
"N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N",
"N", "N", "N", "N", "N", "N", "N"), pre_post = c("N", "N", "N",
"N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N",
"N", "N", "N", "N", "N", "N", "N", "N", "N"), case_control = c("N",
"N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N",
"N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N"), expsy = c("Y",
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",
"Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y"), hosp = c("Out",
"Out", "Out", "Out", "In", "In", "NA", "NA", "In", "In", "NR",
"NR", "NR", "NR", "Mx", "Mx", "NR", "NR", "Mx", "Mx", "Out",
"Out", "Out", "Out", "Out"), tx...11 = c("task1", "VGT", "dGT",
"task1", "BT", "H-TT", "task1", "VGT", "BT", "H-TT", "task1",
"VGT", "HFL", "H-TT", "BF", "BT", "HFL", "task1", "HFL", "LM-TT",
"HFL", "BL", "task1", "HFL", "task1"), vt_p = c("17", "17", "24",
"24", "21", "21", "NR", "NR", "NR", "NR", "NA", "NA", "17", "17",
"24", "24", "17", "17", "17", "17", "17", "17", "17", "17", "17"
), n_se = c("12", "12", "20", "20", "6", "6", "10", "10", "NR",
"NR", "20", "20", "10", "6", "8", "8", "15", "15", "10", "6",
"15", "15", "15", "10", "10"), cogn_name_AMI_K = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), cogn_cite_AMI_K = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), cogn_last_stim_AMI_K = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), n_bcogn_AMI_K = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), n_pcogn_AMI_K = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), pr_cognm_AMI_K = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
pr_cognsd_AMI_K = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), po_cognm_n_AMI_K = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), po_cognsd_n_AMI_K = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), cogn_name_BVMT_perc_retention = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, "brief_visual_memory_test", "brief_visual_memory_test",
"brief_visual_memory_test", NA, NA), cogn_cite_BVMT_perc_retention = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "benedict_1997", "benedict_1997", "benedict_1997",
NA, NA), cogn_last_stim_BVMT_perc_retention = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, "NR", "NR", "NR", NA, NA), n_bcogn_BVMT_perc_retention = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 19, 24, 18, NA, NA), n_pcogn_BVMT_perc_retention = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 19, 24, 18, NA, NA), pr_cognm_BVMT_perc_retention = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "91.6", "86.6", "90", NA, NA), pr_cognsd_BVMT_perc_retention = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "17", "36", "13.3", NA, NA), po_cognm_n_BVMT_perc_retention = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "86.7", "82.1", "71.7", NA, NA), po_cognsd_n_BVMT_perc_retention = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "20", "39.3", "24.2", NA, NA), cogn_name_BNT_naming = c(NA,
NA, NA, NA, NA, NA, NA, NA, "boston_naming_task_naming",
"boston_naming_task_naming", NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), cogn_cite_BNT_naming = c(NA,
NA, NA, NA, NA, NA, NA, NA, "kaplan_1983", "kaplan_1983",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), cogn_last_stim_BNT_naming = c(NA, NA, NA, NA, NA, NA,
NA, NA, 30, 30, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), n_bcogn_BNT_naming = c(NA, NA, NA, NA, NA,
NA, NA, NA, 14, 14, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA), n_pcogn_BNT_naming = c(NA, NA, NA, NA,
NA, NA, NA, NA, 14, 14, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), pr_cognm_BNT_naming = c(NA, NA,
NA, NA, NA, NA, NA, NA, "19.64", "18.14", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), pr_cognsd_BNT_naming = c(NA,
NA, NA, NA, NA, NA, NA, NA, "9.15", "5.3", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), po_cognm_n_BNT_naming = c(NA,
NA, NA, NA, NA, NA, NA, NA, "20.21", "20.71", NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), po_cognsd_n_BNT_naming = c(NA,
NA, NA, NA, NA, NA, NA, NA, "9.38", "6.34", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), cogn_name_VV2_CRT_error = c(NA,
NA, NA, NA, NA, NA, "VV2_crt_error", "VV2_crt_error", NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, "VV2_crt_error", "VV2_crt_error",
NA, NA, NA, NA, NA), cogn_cite_VV2_CRT_error = c(NA, NA,
NA, NA, NA, NA, "robbins_1994", "robbins_1994", NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, "robbins_1994", "robbins_1994",
NA, NA, NA, NA, NA), cogn_last_stim_VV2_CRT_error = c(NA,
NA, NA, NA, NA, NA, "NR", "NR", NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, "NR", "NR", NA, NA, NA, NA, NA), n_bcogn_VV2_CRT_error = c(NA,
NA, NA, NA, NA, NA, 12, 12, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 12, 12, NA, NA, NA, NA, NA), n_pcogn_VV2_CRT_error = c(NA,
NA, NA, NA, NA, NA, 12, 12, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 12, 12, NA, NA, NA, NA, NA), pr_cognm_VV2_CRT_error = c(NA,
NA, NA, NA, NA, NA, "8.75", "7.58", NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "5.65", "6.54", NA, NA, NA, NA, NA), pr_cognsd_VV2_CRT_error = c(NA,
NA, NA, NA, NA, NA, "1.13", "2.84", NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "1.10", "1.89", NA, NA, NA, NA, NA), po_cognm_n_VV2_CRT_error = c(NA,
NA, NA, NA, NA, NA, "7.50", "5.33", NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "7.50", "2.34", NA, NA, NA, NA, NA), po_cognsd_n_VV2_CRT_error = c(NA,
NA, NA, NA, NA, NA, "2.06", "2.42", NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "2.06", "2", NA, NA, NA, NA, NA), cogn_name_VV2_CRT_latency = c(NA,
NA, NA, NA, NA, NA, "VV2_crt_latency", "VV2_crt_latency",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), cogn_cite_VV2_CRT_latency = c(NA, NA, NA, NA, NA,
NA, "robbins_1994", "robbins_1994", NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), cogn_last_stim_VV2_CRT_latency = c(NA,
NA, NA, NA, NA, NA, "NR", "NR", NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), n_bcogn_VV2_CRT_latency = c(NA,
NA, NA, NA, NA, NA, 12, 12, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), n_pcogn_VV2_CRT_latency = c(NA,
NA, NA, NA, NA, NA, 12, 12, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), pr_cognm_VV2_CRT_latency = c(NA,
NA, NA, NA, NA, NA, "476.05", "465.65", NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), pr_cognsd_VV2_CRT_latency = c(NA,
NA, NA, NA, NA, NA, "35.86", "37.54", NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), po_cognm_n_VV2_CRT_latency = c(NA,
NA, NA, NA, NA, NA, "460.66", "433.13", NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), po_cognsd_n_VV2_CRT_latency = c(NA,
NA, NA, NA, NA, NA, "34.75", "46.70", NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA,
-25L), class = c("tbl_df", "tbl", "data.frame"))
I have a data set consisting of a matrix with quite a few NAs. From this, I want to create a data frame storing both the location and the value of the non-NA values. Following this answer, the locations can be obtained via tempList <- which(!is.na(dummy),TRUE).
Currently I use a for loop afterwards. Is there a better way to add the values?
Data:
structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, "#000000FF", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "#000000FF", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Dim = c(10L,
10L))
Desired result:
structure(list(x = c(8, 7), y = c(5, 7), colour = structure(c(1L,
1L), .Label = "#000000FF", class = "factor")), class = "data.frame", row.names = c(NA,
-2L))
Current code:
# Positions (row, column) of every non-NA cell of the matrix `dummy`.
tempList <- which(!is.na(dummy), arr.ind = TRUE)
changedDF <- data.frame(row = tempList[, 1], column = tempList[, 2])
# Indexing a matrix with a two-column (row, column) matrix returns the value
# at each listed position, so no loop is needed. This also handles a matrix
# without any non-NA cell, where looping over 1:nrow(changedDF) would still
# run for i = 1 and i = 0.
changedDF$colour <- dummy[tempList]