Issues with pivot_wider and unique identifiers because of duplicate values - r

I'm trying to use pivot_wider to move my dataset from long to wide so I can use it in a different programme.
I have seen the other posts on this topic but the solutions don't address my problem.
I have a measurement variable called "rating" which has a value for each "rock" and each test ("gentest", first and second). I have an id variable called "turkcode".
For each individual in the dataset, there are 18 ratings. The problem is that there are 4 ratings for rock #8 and I think this is why the data won't pivot wider the way I want them to.
Here's a subset of the data
structure(list(turkcode = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L), .Label = c("100879",
"104655", "108505", "110324", "110600", "112445", "114083", "115814",
"116573", "117411", "117817", "118651", "119324", "121548", "121883",
"121918", "123275", "123718", "125491", "127450", "127825", "128062",
"129061", "131404", "135358", "135594", "135671", "135945", "137951",
"138675", "139469", "140924", "145730", "147222", "148533", "150851",
"153455", "158882", "164468", "166907", "169260", "171463", "172398",
"175565", "177108", "179000", "180270", "183953", "185574", "185880",
"185948", "186371", "187787", "189220", "190014", "192550", "193904",
"195308", "196755", "197493", "198368", "200155", "200297", "201915",
"214519", "215994", "217903", "218771", "219302", "220434", "222740",
"223223", "224721", "225118", "225223", "229856", "229874", "231301",
"232576", "233842", "234215", "237581", "239567", "240609", "241098",
"241423", "242108", "244633", "246055", "251597", "252929", "255252",
"256652", "259936", "274962", "277053", "279422", "280317", "282602",
"283750", "285737", "286259", "287544", "288507", "290503", "291401",
"291835", "292160", "294117", "297863", "298061", "299347", "299499",
"301399", "304875", "305231", "306312", "307410", "308979", "311157",
"311524", "311630", "318956", "318988", "319995", "321405", "324288",
"327086", "327559", "328345", "328401", "330318", "330909", "332723",
"334115", "334517", "335811", "335831", "337145", "338323", "338542",
"338575", "340083", "341182", "343612", "343947", "344554", "346476",
"349874", "350117", "350433", "350972", "351187", "355311", "356717",
"359366", "360048", "360058", "361191", "361971", "362827", "363543",
"367244", "374254", "374965", "376278", "377622", "382139", "382916",
"384586", "385229", "386782", "388951", "389029", "390299", "390662",
"396335", "396732", "398076", "398573", "399276", "399587", "403388",
"406073", "406160", "411977", "412935", "417350", "420060", "421393",
"422944", "424462", "427143", "429291", "430758", "431629", "431638",
"431935", "432218", "433788", "434291", "436681", "437087", "439385",
"439499", "440477", "440834", "441253", "441876", "443826", "444080",
"447597", "452643", "454649", "457055", "457946", "463512", "464079",
"464123", "467897", "468650", "470211", "471115", "471512", "475493",
"476937", "479198", "482871", "484066", "484070", "485462", "486402",
"491701", "491835", "499644", "501833", "502335", "502373", "504800",
"507439", "507946", "507987", "509066", "513078", "515519", "517017",
"517988", "519144", "519210", "519858", "522847", "523683", "525315",
"528577", "532463", "532630", "533028", "539033", "539852", "540690",
"546773", "546916", "549652", "551599", "554198", "556066", "559920",
"560804", "560857", "562080", "562420", "563841", "565668", "565776",
"566509", "569039", "572553", "575364", "576421", "576694", "576877",
"577120", "577155", "577534", "577605", "578463", "578820", "578995",
"580213", "581893", "582433", "582905", "583887", "584569", "585314",
"585566", "587393", "589144", "592284", "594463", "596863", "601837",
"602632", "604254", "605885", "609296", "609963", "610062", "612437",
"612949", "613161", "614372", "614777", "615372", "615384", "616927",
"618118", "620041", "620336", "621634", "622289", "624098", "626163",
"626612", "627019", "627856", "630003", "630255", "634018", "634478",
"635801", "638606", "640012", "641078", "641366", "641436", "641821",
"642076", "642446", "643329", "643942", "644015", "646792", "647254",
"647700", "649516", "650792", "650810", "651229", "652387", "652671",
"654778", "657964", "658894", "660500", "660607", "664469", "666754",
"666796", "668996", "669712", "671682", "673516", "675712", "677835",
"678008", "679262", "680295", "686455", "690471", "691175", "692489",
"694023", "696001", "698716", "700133", "700641", "707812", "707953",
"708010", "708881", "713657", "715255", "715386", "716764", "718936",
"719956", "725348", "727753", "728436", "729588", "730513", "731928",
"732013", "732438", "733366", "733559", "734672", "735174", "735675",
"737044", "737127", "741264", "745262", "748173", "748414", "748943",
"749221", "749963", "750363", "753518", "754512", "754970", "758639",
"760838", "761642", "766250", "770646", "772574", "773054", "775271",
"776762", "778208", "779453", "781378", "781861", "782257", "785763",
"785860", "787011", "790280", "791735", "791903", "792178", "796650",
"796822", "796970", "798621", "802731", "804701", "805606", "807848",
"809142", "810539", "812182", "812321", "814029", "814545", "814774",
"815079", "816572", "824215", "825063", "827763", "829973", "829983",
"830126", "832112", "832666", "833066", "834756", "835270", "835340",
"837413", "837746", "839882", "846097", "847975", "848746", "851745",
"851975", "856622", "858918", "859174", "859182", "859726", "859850",
"862222", "864356", "865028", "869700", "871576", "872256", "873350",
"873597", "875873", "883140", "886308", "886592", "886706", "892144",
"893930", "894959", "896820", "900374", "901373", "902879", "904147",
"905194", "906305", "908049", "908798", "911505", "913314", "915390",
"915833", "919057", "922432", "924120", "925640", "927671", "932006",
"936810", "936916", "938349", "940727", "941945", "942271", "943188",
"944548", "945783", "947164", "948322", "949181", "951414", "952632",
"955090", "956428", "956985", "959916", "960349", "962224", "962980",
"964665", "967160", "967588", "969929", "972543", "972893", "977734",
"978083", "978981", "980427", "980782", "981541", "981850", "982220",
"983781", "985193", "986366", "988934", "989056", "991218", "991914",
"995411", "995630", "995873", "995936", "996309"), class = "factor"),
aid = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("mem",
"noMem"), class = "factor"), gentest = structure(c(1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L,
2L, 1L, 2L), .Label = c("first", "second"), class = "factor"),
rocks = structure(c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L,
6L, 6L, 7L, 7L, 8L, 8L, 8L, 8L, 1L, 1L), .Label = c("R1",
"R2", "R3", "R4", "R5", "R6", "R7", "R8"), class = "factor"),
rating = c(7L, 5L, 2L, 7L, 4L, 2L, 6L, 3L, 3L, 2L, 3L, 3L,
2L, 1L, 3L, 6L, 3L, 2L, 2L, 4L), condition = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("baseline", "category", "property"
), class = "factor"), order = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("after", "before", "none"), class = "factor")), row.names = c(NA,
-20L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), groups = structure(list(
turkcode = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L), .Label = c("100879",
"104655", "108505", "110324", "110600", "112445", "114083",
"115814", "116573", "117411", "117817", "118651", "119324",
"121548", "121883", "121918", "123275", "123718", "125491",
"127450", "127825", "128062", "129061", "131404", "135358",
"135594", "135671", "135945", "137951", "138675", "139469",
"140924", "145730", "147222", "148533", "150851", "153455",
"158882", "164468", "166907", "169260", "171463", "172398",
"175565", "177108", "179000", "180270", "183953", "185574",
"185880", "185948", "186371", "187787", "189220", "190014",
"192550", "193904", "195308", "196755", "197493", "198368",
"200155", "200297", "201915", "214519", "215994", "217903",
"218771", "219302", "220434", "222740", "223223", "224721",
"225118", "225223", "229856", "229874", "231301", "232576",
"233842", "234215", "237581", "239567", "240609", "241098",
"241423", "242108", "244633", "246055", "251597", "252929",
"255252", "256652", "259936", "274962", "277053", "279422",
"280317", "282602", "283750", "285737", "286259", "287544",
"288507", "290503", "291401", "291835", "292160", "294117",
"297863", "298061", "299347", "299499", "301399", "304875",
"305231", "306312", "307410", "308979", "311157", "311524",
"311630", "318956", "318988", "319995", "321405", "324288",
"327086", "327559", "328345", "328401", "330318", "330909",
"332723", "334115", "334517", "335811", "335831", "337145",
"338323", "338542", "338575", "340083", "341182", "343612",
"343947", "344554", "346476", "349874", "350117", "350433",
"350972", "351187", "355311", "356717", "359366", "360048",
"360058", "361191", "361971", "362827", "363543", "367244",
"374254", "374965", "376278", "377622", "382139", "382916",
"384586", "385229", "386782", "388951", "389029", "390299",
"390662", "396335", "396732", "398076", "398573", "399276",
"399587", "403388", "406073", "406160", "411977", "412935",
"417350", "420060", "421393", "422944", "424462", "427143",
"429291", "430758", "431629", "431638", "431935", "432218",
"433788", "434291", "436681", "437087", "439385", "439499",
"440477", "440834", "441253", "441876", "443826", "444080",
"447597", "452643", "454649", "457055", "457946", "463512",
"464079", "464123", "467897", "468650", "470211", "471115",
"471512", "475493", "476937", "479198", "482871", "484066",
"484070", "485462", "486402", "491701", "491835", "499644",
"501833", "502335", "502373", "504800", "507439", "507946",
"507987", "509066", "513078", "515519", "517017", "517988",
"519144", "519210", "519858", "522847", "523683", "525315",
"528577", "532463", "532630", "533028", "539033", "539852",
"540690", "546773", "546916", "549652", "551599", "554198",
"556066", "559920", "560804", "560857", "562080", "562420",
"563841", "565668", "565776", "566509", "569039", "572553",
"575364", "576421", "576694", "576877", "577120", "577155",
"577534", "577605", "578463", "578820", "578995", "580213",
"581893", "582433", "582905", "583887", "584569", "585314",
"585566", "587393", "589144", "592284", "594463", "596863",
"601837", "602632", "604254", "605885", "609296", "609963",
"610062", "612437", "612949", "613161", "614372", "614777",
"615372", "615384", "616927", "618118", "620041", "620336",
"621634", "622289", "624098", "626163", "626612", "627019",
"627856", "630003", "630255", "634018", "634478", "635801",
"638606", "640012", "641078", "641366", "641436", "641821",
"642076", "642446", "643329", "643942", "644015", "646792",
"647254", "647700", "649516", "650792", "650810", "651229",
"652387", "652671", "654778", "657964", "658894", "660500",
"660607", "664469", "666754", "666796", "668996", "669712",
"671682", "673516", "675712", "677835", "678008", "679262",
"680295", "686455", "690471", "691175", "692489", "694023",
"696001", "698716", "700133", "700641", "707812", "707953",
"708010", "708881", "713657", "715255", "715386", "716764",
"718936", "719956", "725348", "727753", "728436", "729588",
"730513", "731928", "732013", "732438", "733366", "733559",
"734672", "735174", "735675", "737044", "737127", "741264",
"745262", "748173", "748414", "748943", "749221", "749963",
"750363", "753518", "754512", "754970", "758639", "760838",
"761642", "766250", "770646", "772574", "773054", "775271",
"776762", "778208", "779453", "781378", "781861", "782257",
"785763", "785860", "787011", "790280", "791735", "791903",
"792178", "796650", "796822", "796970", "798621", "802731",
"804701", "805606", "807848", "809142", "810539", "812182",
"812321", "814029", "814545", "814774", "815079", "816572",
"824215", "825063", "827763", "829973", "829983", "830126",
"832112", "832666", "833066", "834756", "835270", "835340",
"837413", "837746", "839882", "846097", "847975", "848746",
"851745", "851975", "856622", "858918", "859174", "859182",
"859726", "859850", "862222", "864356", "865028", "869700",
"871576", "872256", "873350", "873597", "875873", "883140",
"886308", "886592", "886706", "892144", "893930", "894959",
"896820", "900374", "901373", "902879", "904147", "905194",
"906305", "908049", "908798", "911505", "913314", "915390",
"915833", "919057", "922432", "924120", "925640", "927671",
"932006", "936810", "936916", "938349", "940727", "941945",
"942271", "943188", "944548", "945783", "947164", "948322",
"949181", "951414", "952632", "955090", "956428", "956985",
"959916", "960349", "962224", "962980", "964665", "967160",
"967588", "969929", "972543", "972893", "977734", "978083",
"978981", "980427", "980782", "981541", "981850", "982220",
"983781", "985193", "986366", "988934", "989056", "991218",
"991914", "995411", "995630", "995873", "995936", "996309"
), class = "factor"), rocks = structure(c(1L, 1L, 2L, 2L,
3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 1L, 1L), .Label = c("R1",
"R2", "R3", "R4", "R5", "R6", "R7", "R8"), class = "factor"),
gentest = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("first",
"second"), class = "factor"), .rows = list(1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15:16, 17:18,
19L, 20L)), row.names = c(NA, -18L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE))
Does anyone know how I can modify the second set of ratings for rock #8 so that I can pivot the data wider or even exclude this data from the dataset altogether?
EDIT:
Here is an example of how I'd like the output to look
# Mock-up of the desired wide layout: the identifying columns first, then one
# rating column per rock (R1-R8). The ratings are random placeholders, so the
# values differ on every run.
id <- rep("100879", times = 6)
aid <- rep("mem", times = 6)
test <- rep(c("first", "second"), times = 3)
order <- rep("after", times = 6)
condition <- rep("cat", times = 6)
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
R1 <- sample(0:9, 6, replace = TRUE)
R2 <- sample(0:9, 6, replace = TRUE)
R3 <- sample(0:9, 6, replace = TRUE)
R4 <- sample(0:9, 6, replace = TRUE)
R5 <- sample(0:9, 6, replace = TRUE)
R6 <- sample(0:9, 6, replace = TRUE)
R7 <- sample(0:9, 6, replace = TRUE)
R8 <- sample(0:9, 6, replace = TRUE)
# data.frame() keeps the ratings numeric; cbind() on mixed character/integer
# vectors builds a character matrix and turns every rating into a string.
df <- data.frame(id, aid, test, order, condition, R1, R2, R3, R4, R5, R6, R7, R8)

a data.table suggestion
library(data.table)
# Convert mydata to a data.table in place.
setDT(mydata)
# Running counter within each turkcode/aid/gentest/condition/order/rocks
# combination: 1 for the first rating, 2 for a repeated one, and so on.
mydata[, row_id := rowid(turkcode, aid, gentest, condition, order, rocks)]
# Label used for the wide columns: the rock name, with "_<row_id>" appended
# for every rating after the first (e.g. "R8_2").
mydata[, rocks2 := paste0(rocks)]
mydata[row_id != 1, rocks2 := paste0(rocks2, "_", row_id)]
# Cast to wide: one row per id combination, one column per rocks2 label.
dcast(
  mydata,
  turkcode + aid + gentest + condition + order ~ rocks2,
  value.var = "rating"
)
# turkcode aid gentest condition order R1 R2 R3 R4 R5 R6 R7 R8 R8_2
# 1: 100879 mem first category after 7 2 4 6 3 3 2 3 6
# 2: 100879 mem second category after 5 7 2 3 2 3 1 3 2
# 3: 104655 mem first category after 2 NA NA NA NA NA NA NA NA
# 4: 104655 mem second category after 4 NA NA NA NA NA NA NA NA

Another option using pivot_wider and separate
library(dplyr)
library(tidyr)
# Short version: duplicated ratings are kept, but R1-R8 end up as list-columns.
df %>%
  pivot_wider(
    id_cols = c(turkcode, aid, gentest, condition, order),
    names_from = rocks,
    values_from = rating,
    values_fn = list(rating = list)
  )
# Clean version: collapse duplicated ratings into one comma-separated string
# per cell, then split R8 into two columns.
df %>%
  # id_cols: a set of columns that uniquely identifies each observation.
  # Defaults to all columns in data except those named in names_from and
  # values_from.
  pivot_wider(
    id_cols = c(turkcode, aid, gentest, condition, order),
    names_from = rocks,
    values_from = rating,
    values_fn = list(rating = function(x) paste(x, collapse = ","))
    # alternatives left disabled by the author:
    #   values_fn = list(rating = mean)
    #   values_fill = list(rating = 0)
  ) %>%
  separate(R8, into = c("R8", "R8_1"))
# A tibble: 4 x 14
# Groups: turkcode, gentest [1,118]
turkcode aid gentest condition order R1 R2 R3 R4 R5 R6 R7 R8 R8_1
<fct> <fct> <fct> <fct> <fct> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 100879 mem first category after 7 2 4 6 3 3 2 3 6
2 100879 mem second category after 5 7 2 3 2 3 1 3 2
3 104655 mem first category after 2 NA NA NA NA NA NA NA NA
4 104655 mem second category after 4 NA NA NA NA NA NA NA NA

Related

count number of times string appears in a column

Can you think about an intuitive way of calculating the number of times the word space appears in a certain column? Or any other solution that is viable.
I basically want to know how many times the space key was pressed, however some participants made the mistake and pressed other keys which would also be considered a mistake. So I was wondering if I should go with the "key_resp.rt" column instead and count the number of response times instead. If you had any idea of how to do both it would be great as I may need to use both.
I used the following code but the results do not conform to the data.
# Sum the occurrences of "space" in key_resp.keys per participant and session.
Data %>%
  group_by(Participant, Session) %>%
  summarise(false_start = sum(str_count(key_resp.keys, "space")))
Here is a snippet of my data:
Participant RT Session key_resp.keys key_resp.rt
X 0.431265 1 ["space"] [2.3173399999941466]
X 0.217685 1
X 0.317435 2 ["space","space"] [0.6671900000001187,2.032510000000002] 2020.1.3 4
Y 0.252515 1
Y 0.05127 2 ["space","space","space","space","space","space","space","space","space"] [4.917419999999765,6.151149999999689,6.333714999999771,6.638249999999971,6.833514999999338,7.0362499999992,7.217724999999504,7.38576999999988,7.66913999999997]
dput(droplevels(head(Data_PVT)))
structure(list(Interval_stimulus = c(4.157783411, 4.876139922,
5.67011868, 9.338167417, 9.196342656, 7.62448411), Participant = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "ADH80254", class = "factor"),
RT = c(431.265, 277.99, 253.515, 310.53, 299.165, 539.46),
Session = c(1L, 1L, 1L, 1L, 1L, 1L), date = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "2020-06-12_11h11.47.141", class = "factor"),
key_resp.keys = structure(c(2L, 1L, 1L, 1L, 1L, 1L), .Label = c("",
"[\"space\"]"), class = "factor"), key_resp.rt = structure(c(2L,
1L, 1L, 1L, 1L, 1L), .Label = c("", "[2.3173399999941466]"
), class = "factor"), psychopyVersion = structure(c(1L, 1L,
1L, 1L, 1L, 1L), .Label = "2020.1.3", class = "factor"),
Trials = 0:5, Reciprocal = c(2.31875992719094, 3.59725169970143,
3.94453977082224, 3.22030077609249, 3.3426370063343, 1.85370555740926
)), row.names = c(NA, 6L), class = "data.frame")
Expected output:
Participant Session false_start
x 1 0
x 2 1
y 1 2
y 2 1
z 1 10
z 2 3
We can use str_count to count "space" values for each Participant and Session and sum them to get total. For all_false_start we count number of words in it.
library(dplyr)
library(stringr)
# Per participant and session:
#   false_start     - how many times the whole word "space" occurs
#   all_false_start - how many words of any kind occur
summarise(
  group_by(df, Participant, Session),
  false_start = sum(str_count(key_resp.keys, '\\bspace\\b')),
  all_false_start = sum(str_count(key_resp.keys, '\\b\\w+\\b'))
)

Creating row in dataframe for each element in vector

I have a vector of numbers:
a <- c(54, 456, 23432, 4868, 34, 245634, 37, 46453, 1342354)
In my already-existent dataframe (head included via dput below), I would like to create a new variable. Each row of the new variable will contain a single element from the vector. So there would be one value (e.g. 54) in each row of the new variable.
structure(list(Phone = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "a", class = "factor"), Frame = structure(c(1L,
3L, 2L, 4L, 6L, 5L), .Label = c("[-4.46225397 -4.14727267 -4.45203785 -4.67251549 -5.13750066 -4.92839463\n -5.03957588 -5.68530479]",
"[-6.14532579 -4.38918589 -4.12275354 -4.19263549 -4.30380823 -4.35621995\n -4.4079389 -4.47339504]",
"[-6.43104195 -4.75506178 -4.2324676 -4.21878988 -4.1635973 -4.11186806\n -4.05023489 -4.08204198]",
"[-7.1528423 -5.46190925 -5.94873845 -6.635839 -6.84179002 -6.85955335\n -6.83714326 -6.87621415]",
"[-7.23901353 -4.61522546 -3.25206619 -3.38407075 -3.63762837 -3.85352927\n -3.94250123 -4.04015791]",
"[-7.34451319 -5.58664694 -4.69929752 -4.621823 -4.51670576 -4.48494125\n -4.39512713 -4.26553646]"
), class = "factor"), Previous = structure(c(1L, 1L, 1L, 1L,
1L, 1L), .Label = "ch", class = "factor"), Following = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "p", class = "factor"), Word = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = "juk'ucha-pi", class = "factor"),
Note = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
"[-10.79197258 -7.97949955 -7.10253093 -7.07957825 -6.98695923\n -6.90015207 -6.79672506 -6.85010073",
"[-10.31251047 -7.36552088 -6.91841906 -7.0356884 -7.2222481\n -7.31020053 -7.39699043 -7.5068328 ",
"[-12.00323036 -9.16566481 -9.982616 -11.13564383 -11.48125155\n -11.51106031 -11.47345379 -11.5390189 ",
"[-12.32487451 -9.37498793 -7.8859212 -7.7559107 -7.5795128\n -7.52620857 -7.37549093 -7.15802398",
"[-12.14783486 -7.74483933 -5.45731306 -5.67883075 -6.10432742\n -6.46663209 -6.61593651 -6.77981481"
), Morph_status = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "", class = "factor"),
row.names = c(NA, 6L), class = "data.frame")
When working with data frames, each variable (column) has as many entries as there are rows. What you are describing is therefore not a data frame and, if I understand your question correctly, the best you can do is to go back to general lists:
# Small example data frame with three rows.
df <- data.frame(a = 1:3, b = 1:3)
# Turn the columns into list elements and add the longer vector `a` as a third
# element named "c"; a list places no constraint on element lengths.
append(as.list(df), list(c = a))
# $a
# [1] 1 2 3
#
# $b
# [1] 1 2 3
#
# $c
# [1] 54 456 23432 4868 34 245634 37 46453 1342354
Another option, if you still want a data frame, is to pad all the shorter columns with NAs:
# cbind.fill() is loaded from the rowr package. As the printed result below
# shows, it binds `a` on as an extra column (named "object") and pads the
# shorter columns `a` and `b` of df with the `fill` value (NA here).
library(rowr)
cbind.fill(df, a, fill = NA)
# a b object
# 1 1 1 54
# 2 2 2 456
# 3 3 3 23432
# 4 NA NA 4868
# 5 NA NA 34
# 6 NA NA 245634
# 7 NA NA 37
# 8 NA NA 46453
# 9 NA NA 1342354

Extract only significant rows from TukeyHSD output

After generating a very large TukeyHSD table, I want to only see the rows that are <0.05 in the adj.p.value column. I have tried the IF and ifelse functions, but they only produce a TRUE/FALSE table. I want to see the whole data row for the significant comparisons. Thanks!
Data
structure(list(Species = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L), .Label = c("A", "B", "C", "D"), class = "factor"),
TREATMENT = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 4L,
4L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 1L, 1L, 1L, 2L, 2L, 3L, 3L,
3L, 4L, 4L), .Label = c("1", "2", "3", "4"), class = "factor"),
`CLASS 1` = c(9.3993303987076, 8.14588087483834, 8.44889021858093,
28.2773809415175, 9.49156649568952, 37.406663111623, 8.42458221212166,
35.529904738035, 33.1401135085753, 8.26977630375797, 7.87786527094827,
7.83020300515061, 35.1465417538538, 10.5560853720815, 7.54702433773332,
7.15030081390987, 7.73624654623485, 33.6461639540039, 10.3098164094602,
7.79017325570062, 8.47473750173462, 8.37179798600773, 31.7364310355766,
2.00147496567679, 9.30194886619568, 7.87886829173726, 7.93445694220837,
9.10020522660375, 8.81542855137005, 7.83313314713951, 7.84449591023115,
23.6150030864875, 9.3452854347794, 8.91047098149179, 7.76031738257155,
9.79467065201063, 24.7592334362831, 8.54842834366722, 7.60436112798701,
8.93480758329653, 8.72406315335014, 9.49850179222777), `CLASS 2` = c(10.8069912074565,
4.52426389123869, 8.13120921128287, 52.3870196313339, 7.17369219206721,
45.7325224336886, 17.8345921677786, 38.4579761235057, 37.5916934855387,
6.28803058195647, 3.72788988807285, 4.64744990904241, 29.7689968962103,
7.08515103144071, 6.44277647222835, 5.71017728280462, 6.28290843412007,
45.6123170472575, 6.98431855663527, 8.03809625184267, 4.76656440828616,
6.74640254081232, 31.3243238213156, 45.1287867136276, 7.6308508343969,
4.0127554151831, 9.11910102221636, 6.1658394708941, 10.4617259648364,
6.07502685224869, 8.08373642262043, 48.588633863193, 10.2160085507338,
7.52606530219909, 5.66373884014351, 8.51992766801391, 25.9109062123364,
5.74498954209992, 5.56377323143979, 7.76698847227212, 7.05016373786876,
7.99745310894107), `CLASS 3` = c(3.96856956332584, 2.76052305637364,
6.92715392916015, 0.687821057043984, 2.30154255462355, 2.61089063893911,
11.2199145273738, 10.7058533354417, 1.90691767773411, 3.93488282297868,
1.7034110083142, 1.69310511636903, 1.54005861925764, 50.436990190291,
3.93233520754151, 4.06684782901502, 6.10592204678281, 0.675086986967025,
3.94018776658881, 5.74129993338595, 2.02845185559621, 4.10963382465756,
14.9264019576272, 12.9672579626868, 5.1049208042632, 1.37282635713804,
3.00088572108344, 4.78878116348504, 4.79564218319094, 3.03836532949481,
3.48474205480686, 1.09218910757234, 6.2830307568812, 3.06784943090836,
4.89376208853059, 6.6321148581705, 1.01356027363186, 3.15439940439419,
40.8141653079423, 2.52825000616702, 1.65382018138259, 1.81173455682492
), `CLASS 4` = c(0, 13.4274810838142, 10.9876140536356, 3.15424686759082,
15.2632739415738, 0, 0, 2.39525969535064, 4.19386122886851,
13.15599261724, 14.5421891905919, 14.4542067660843, 0, 0,
12.2276086827261, 12.7527880016103, 12.1436697242409, 3.79216208516423,
0, 12.2283190622827, 16.0271803699645, 14.035876401479, 4.24556176551009,
0, 0, 14.4993393432366, 13.6722412691012, 0, 14.0027443968931,
13.7579074961889, 12.9935353616471, 4.66128854387559, 0,
15.1941922851023, 11.6990009190362, 0, 7.99399142573694,
14.5041748372822, 0, 15.0674109079436, 13.7134908002476,
15.4194201146961), `CLASS 5` = c(7.82638584740367, 6.56112678542475,
6.95253086439919, 2.06445951884762, 7.17086660532553, 2.58627258328855,
7.83400556063298, 1.77053879587063, 2.65292759651742, 6.94701807830366,
6.85309102458439, 6.71505104532983, 4.06818278652755, 5.79906266122279,
6.62064468061089, 6.88365856613044, 7.68403751285005, 2.38479005191691,
9.07405520739349, 7.65785587918449, 7.4385885335047, 7.30144390122309,
2.41680929257195, 4.18258704279641, 7.8906816661241, 6.75678558060943,
6.67150537517493, 8.24794113296791, 7.67443442992891, 6.89357008866252,
6.45444668132533, 2.98342694785768, 8.704729108357, 7.14382850099481,
7.15233553294014, 9.14001781571836, 3.98831954045444, 7.54093786042356,
5.79029360470226, 6.82793163574773, 6.48049736162586, 7.18554914992982
), `CLASS 6` = c(20.6189597026452, 22.8728557858066, 23.0767150659042,
4.99832103176657, 24.1726463550235, 5.56104550736533, 31.8124013284184,
4.31653191057476, 8.4695331411828, 20.63468068931, 19.7369752322083,
19.6902616040991, 11.6648564225744, 0, 25.2321582223958,
22.2981543181678, 29.3198455372777, 5.88723409877159, 30.1474816315191,
28.0835788057802, 24.0430626320328, 25.1446564854412, 5.78713327050339,
30.6155806819949, 23.8853696442419, 20.1783872969561, 17.5935515655693,
23.4169038776536, 21.1986239116884, 19.6931330316831, 22.2658181144794,
7.38944654414811, 32.1897387187698, 23.6398829158785, 25.3561697324352,
30.4118856020653, 12.6822088903071, 26.300118251779, 11.7338836812169,
23.8624555097246, 20.1037712460599, 21.8478004507985), `CLASS 7` = c(15.9129851563051,
15.2250454288061, 13.5577123002506, 2.9902563940573, 15.4408266617369,
2.67511425705514, 8.17164465017573, 2.23047357314211, 4.01010767344732,
13.4046459481448, 15.3008244637288, 15.3885729336047, 7.81496654756214,
17.8194559247092, 12.7823202355514, 13.7684066964868, 15.378473991847,
3.75026919344972, 18.2880822635935, 14.7412162942703, 17.5270089738067,
16.799718650752, 4.33839497916674, 2.21937177530762, 15.0315149187176,
15.3632530721031, 14.1580725482114, 16.4215442147509, 15.5113323256627,
14.3349000132624, 12.8504657216928, 5.06281347160092, 15.5075336560533,
15.9392345541138, 13.3981839319596, 16.6700105346756, 8.10398633871805,
15.958090408468, 16.5733149488757, 15.1802203155931, 14.2236219296677,
16.2095182295187), `CLASS 8` = c(19.9174685533413, 16.6755018156139,
13.9892072522183, 3.35339208579287, 18.98558519396, 3.42749146804023,
10.4801793890691, 2.97802997775506, 5.11270635117451, 17.0372757040089,
18.7865491767228, 18.3992789502607, 9.99639697401416, 0,
15.9270550696003, 17.1615519869107, 15.3488962066467, 4.25197658246908,
21.2560581648095, 15.7194605175531, 19.6944057250743, 17.4904702096271,
5.22494387772846, 2.88494085790995, 19.1038328534942, 19.0183655117756,
17.533290326259, 19.92632149392, 17.5400682364295, 17.664926273487,
16.3075864395099, 6.6071984352649, 17.7536737744256, 18.5784760293114,
14.706720581834, 18.8313728693457, 9.73353207739478, 18.2488613518859,
8.53356517614357, 19.8319355692553, 17.4801581342745, 20.0300225970631
), `CLASS 9` = c(11.5493095708147, 9.80732127808386, 7.92896710456816,
2.08710247204941, 0, 0, 4.22268016442976, 1.61543185032431,
2.92213933696131, 10.3276972542995, 11.4712047448286, 11.1818706700593,
0, 8.30325482025479, 9.28807709161222, 10.2081145049644,
0, 0, 0, 0, 0, 0, 0, 0, 12.0508804125665, 10.9194191312608,
10.316895230176, 11.9324634197247, 0, 10.709037767833, 9.7151732936871,
0, 0, 0, 9.36977099054923, 0, 5.81426180513736, 0, 3.38664292169246,
0, 10.5704134555229, 0), `CLASS 10` = c(44.7938508721352,
51.7310046920715, 57.5715824785637, 89.5047895292528, 58.4027215389776,
91.3111216916161, 69.2914902356924, 91.4055258029079, 85.3021190418994,
52.2833630152431, 47.5883305901355, 48.3152264007455, 78.1204536918961,
68.0782265938132, 55.3819029226251, 51.9782682455077, 61.5885922886562,
89.6129641721643, 51.3818043642034, 61.8814673089921, 55.3399967676143,
58.4083672383978, 88.0198518505328, 90.713100323986, 45.9230901490977,
47.942176704251, 51.3202365201787, 43.4717297386365, 59.2741650079789,
50.3975658567551, 54.6723278637849, 85.3465611452765, 58.0340634611641,
58.33846091558, 55.372988962717, 55.3585987802603, 72.3599002382954,
58.2521103792226, 65.716183348586, 58.1599124794039, 51.2453091189091,
56.5749100234884), `CLASS 11` = c(55.2061491278648, 48.2689953079285,
42.4284175214362, 10.4952104707472, 41.5972784610224, 8.68887830838393,
30.7085097643076, 8.59447419709211, 14.6978809581006, 47.7166369847569,
52.4116694098645, 51.6847735992545, 21.8795463081039, 31.9217734061868,
44.6180970773749, 48.0217317544923, 38.4114077113438, 10.3870358278357,
48.6181956357966, 38.1185326910079, 44.6600032323857, 41.5916327616022,
11.9801481494672, 9.28689967601398, 54.0769098509023, 52.0578232957489,
48.6797634798213, 56.5282702613635, 40.7258349920211, 49.6024341432449,
45.3276721362151, 14.6534388547235, 41.9659365388359, 41.66153908442,
44.627011037283, 44.6414012197397, 27.6400997617046, 41.7478896207774,
34.283816651414, 41.8400875205961, 48.7546908810909, 43.4250899765116
), `CLASS 12` = c(0.811392418775427, 1.07172325344784, 1.35691090645737,
8.52815575054215, 1.40400342762093, 10.5089654211764, 2.25642633809048,
10.6353831202186, 5.80370185913679, 1.09570511081795, 0.907972043744494,
0.934805805194479, 3.57047868323309, 2.13265803649301, 1.24124305047309,
1.08239054166649, 1.60339326148851, 8.62738568129464, 1.05684309531167,
1.62339583767845, 1.23914000811097, 1.40432975000493, 7.34714218491929,
9.76785617252635, 0.849218090969217, 0.920940862853288, 1.05424169822542,
0.769026356858985, 1.45544382379371, 1.01603009463636, 1.20615785649631,
5.82433666195463, 1.38288498357373, 1.40029538508808, 1.24079537651438,
1.24007305478085, 2.61793194894868, 1.3953306600253, 1.91682810629766,
1.39005236188319, 1.05108468934595, 1.30281618424025)), row.names = c(NA,
-42L), .Names = c("Species", "TREATMENT", "CLASS 1", "CLASS 2",
"CLASS 3", "CLASS 4", "CLASS 5", "CLASS 6", "CLASS 7", "CLASS 8",
"CLASS 9", "CLASS 10", "CLASS 11", "CLASS 12"), class = c("tbl_df",
"tbl", "data.frame"))
# melt() comes from reshape2. Package names are case-sensitive, so the package
# has to be attached as `reshape2`; library(Reshape2) fails to load.
library(reshape2)
library(dplyr)
library(broom)
# Long format: one row per TREATMENT x Species x Class with its Percentage.
melt <- melt(
  example,
  id.vars = c("TREATMENT", "Species"),
  value.name = "Percentage",
  variable.name = "Class"
)
# One Tukey HSD fit (Percentage ~ TREATMENT) per Species/Class combination,
# stored in the list-column `Model`.
res <- melt %>%
  group_by(Species, Class) %>%
  do(Model = TukeyHSD(aov(Percentage ~ TREATMENT, data = .)))
# Flatten all fits into one table of pairwise comparisons.
as.data.frame(tidy(res, Model))
You can select the rows of interest using, e.g., filter from dplyr. In filter you give a logical statement that indicates which rows you want. In your case, you want all rows where adj.p.value is less than .05.
# Keep only the comparisons whose adjusted p-value is below 0.05.
tidy(res, Model) %>%
  as.data.frame() %>%
  filter(adj.p.value < 0.05)
Species Class comparison estimate conf.low conf.high adj.p.value
1 A CLASS 3 3-1 6.410802 1.668508 11.153096 0.013375263
2 A CLASS 3 3-2 9.096133 4.353839 13.838426 0.002303284
3 A CLASS 3 4-3 -8.041984 -13.236906 -2.847061 0.006931852
4 A CLASS 9 2-1 -9.066165 -16.389642 -1.742688 0.020094567

Combining dataframe rows based on a value in a range [duplicate]

This question already has an answer here:
Comparing multiple columns in different data sets to find values within range R
(1 answer)
Closed 8 years ago.
I'm trying to bring together (it's not really a merge or join) data contained in two dataframes based on whether a value in one falls within a range on the second.
data is at the end of the post for convenience.
One data frame (df1) looks like this:
Chromosome Position P.value start.range end.range name
2 4553493 8.23e-05 4453493 4653493 A
3 24548810 1.04e-04 24448810 24648810 B
1 9952003 2.09e-04 9852003 10052003 C
The second df is much longer, but head(df2) looks like this:
ensembl_gene_id chromosome_name start_position end_position
OS01G0281600 1 10048273 10050309
OS01G0281400 1 10021423 10027120
OS01G0281301 1 10019633 10020376
OS01G0281200 1 10011875 10015468
OS01G0281100 1 10008075 10011595
OS01G0281000 1 10003952 10007742
I need to match the rows from each IF df1$Position is within 100,000 of either df2$start_position or df2$end_position, i.e. ((df1$Position - df2$start_position) < 100000 | (df1$Position - df2$end_position) < 100000).
I need, as output, a list or dataframe of the rows that match. There will be multiple df2 values that match df1, and there are multiple entries per chromosome, though df1$name is unique. I've been trying various applications of ddply and custom functions, but am coming up short. Any ideas?
data:
# df1: three rows, one per uniquely named position (A, B, C). In this data
# start.range and end.range equal Position - 100000 and Position + 100000.
df1 <- structure(list(Chromosome = c(2L, 3L, 1L), Position = c(4553493L,
24548810L, 9952003L), P.value = c(8.23e-05, 0.000104, 0.000209
), start.range = c(4453493, 24448810, 9852003), end.range = c(4653493,
24648810, 10052003), name = c("A", "B", "C")), .Names = c("Chromosome",
"Position", "P.value", "start.range", "end.range", "name"), class = "data.frame", row.names = c(NA,
3L))
# df2: 51 rows, one per ensembl_gene_id, giving the chromosome (1, 2 or 3)
# and the integer start_position / end_position of each entry.
df2 <- structure(list(ensembl_gene_id = c("OS01G0281600", "OS01G0281400",
"OS01G0281301", "OS01G0281200", "OS01G0281100", "OS01G0281000",
"OS01G0280500", "OS01G0280400", "OS01G0280000", "OS01G0279900",
"OS01G0279800", "OS01G0279700", "OS01G0279400", "OS01G0279300",
"OS01G0279200", "OS01G0279100", "OS01G0279000", "OS01G0278900",
"OS01G0278950", "OS02G0183000", "OS02G0182850", "OS02G0182900",
"OS02G0182700", "OS02G0182800", "OS02G0182500", "OS02G0182300",
"OS02G0181900", "OS02G0182100", "OS02G0181800", "OS02G0181400",
"OS02G0180900", "OS02G0180700", "OS02G0180500", "OS02G0180200",
"OS02G0180400", "OS02G0180100", "OS03G0640300", "OS03G0640400",
"OS03G0640000", "OS03G0640100", "OS03G0639700", "OS03G0639800",
"OS03G0639600", "OS03G0639400", "OS03G0639300", "OS03G0638900",
"OS03G0639100", "OS03G0638400", "OS03G0638800", "OS03G0638300",
"OS03G0638200"), chromosome_name = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), start_position = c(10048273L,
10021423L, 10019633L, 10011875L, 10008075L, 10003952L, 9967185L,
9962807L, 9936850L, 9928971L, 9917593L, 9913390L, 9889550L, 9887657L,
9878384L, 9874379L, 9866730L, 9859354L, 9863216L, 4639932L, 4629617L,
4630446L, 4616832L, 4625425L, 4598883L, 4594375L, 4567630L, 4573831L,
4563073L, 4551426L, 4521670L, 4497115L, 4486531L, 4460342L, 4481872L,
4455016L, 24630180L, 24638186L, 24616417L, 24621460L, 24591421L,
24596843L, 24574540L, 24564913L, 24544511L, 24487877L, 24514494L,
24466606L, 24476060L, 24454477L, 24449135L), end_position = c(10050309L,
10027120L, 10020376L, 10015468L, 10011595L, 10007742L, 9969073L,
9966715L, 9947933L, 9935981L, 9921565L, 9917318L, 9902737L, 9889123L,
9885517L, 9876678L, 9870864L, 9860677L, 9866617L, 4641686L, 4630180L,
4634616L, 4621974L, 4628750L, 4601382L, 4595386L, 4573049L, 4578257L,
4566597L, 4552860L, 4523668L, 4500124L, 4489409L, 4463571L, 4483470L,
4457715L, 24634746L, 24641449L, 24617859L, 24629502L, 24596437L,
24600376L, 24579212L, 24565726L, 24549550L, 24489307L, 24515219L,
24473558L, 24480927L, 24457481L, 24453890L)), .Names = c("ensembl_gene_id",
"chromosome_name", "start_position", "end_position"), class = "data.frame", row.names = c(NA,
-51L))
Is this what you want?
library(plyr)

# For each df1 row (df1$name is unique, so each group is a single hit), return
# the df2 rows on the same chromosome whose start or end position lies within
# 100,000 of the hit's Position. ddply() prepends the `name` column to the
# matched df2 rows.
ddply(df1, .(name), function(hit) {
  max_dist <- 100000
  # Positions are only comparable within one chromosome.
  same_chr <- df2$chromosome_name == hit$Chromosome
  # abs() is required: a plain signed difference is negative, and therefore
  # "< 100000", for every gene located after the hit, however far away.
  near_start <- abs(hit$Position - df2$start_position) < max_dist
  near_end <- abs(hit$Position - df2$end_position) < max_dist
  df2[same_chr & (near_start | near_end), ]
})

Checking row format of csv

I am trying to import some data (below) and check whether I have the appropriate number of rows for later analysis.
# repexample: 22 rows x 5 columns. QueueName and X8Tile are factors; note that
# the levels " Overall" and " Average" carry a leading space. " Overall" and
# "CCM4.usci_retention_eng" have 9 rows each, "usci_helpdesk" has only 4.
repexample <- structure(list(QueueName = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c(" Overall", "CCM4.usci_retention_eng", "usci_helpdesk"
), class = "factor"), X8Tile = structure(c(1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L), .Label = c(" Average", "1", "2", "3", "4", "5", "6", "7",
"8"), class = "factor"), Actual = c(508.1821504, 334.6994838,
404.9048759, 469.4068667, 489.2800416, 516.5744106, 551.7966176,
601.5103783, 720.9810622, 262.4622533, 250.2777778, 264.8281938,
272.2807882, 535.2466968, 278.25, 409.9285714, 511.6635101, 553,
641, 676.1111111, 778.5517241, 886.3666667), Calls = c(54948L,
6896L, 8831L, 7825L, 5768L, 7943L, 5796L, 8698L, 3191L, 1220L,
360L, 454L, 406L, 248L, 11L, 9L, 94L, 1L, 65L, 9L, 29L, 30L),
Pop = c(41L, 6L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 3L, 1L, 1L,
1L, 11L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L)), .Names = c("QueueName",
"X8Tile", "Actual", "Calls", "Pop"), class = "data.frame", row.names = c(NA,
-22L))
The data gives 5 columns and is one example of some data that I would typically import (via a .csv file). As you can see there are three unique values in the column "QueueName". For each unique value in "QueueName" I want to check that it has 9 rows, or the corresponding values in the column "X8Tile" ( Average, 1, 2, 3, 4, 5, 6, 7, 8). As an example the "QueueName" Overall has all of the necessary rows, but usci_helpdesk does not.
So my first priority is to at least identify if one of the unique values in "QueueName" does not have all of the necessary rows.
My second priority would be to remove all of the rows corresponding to a unique "QueueName" that does not meet the requirements.
Both these priorities are easily addressed using the Split-Apply-Combine paradigm, implemented in the plyr package.
Priority 1: Identify values of QueueName which don't have enough rows
# library() stops with an error if plyr is not installed; require() would only
# return FALSE and let the ddply() call below fail with a less helpful message.
library(plyr)
# Make a short table of the number of rows for each unique value of QueueName
rowSummary <- ddply(
  repexample,
  .(QueueName),
  summarise,
  numRows = length(QueueName)
)
print(rowSummary)
If you have lots of unique values of QueueName, you'll want to identify the values which are not equal to 9:
# Show only the queues whose row count differs from the expected 9
subset(rowSummary, numRows != 9)
Priority 2: Eliminate rows for which QueueName does not have enough rows
# Tag every row with the number of rows its QueueName group contains ...
repexample2 <- ddply(
  repexample,
  .(QueueName),
  transform,
  numRows = length(QueueName)
)
# ... then keep only the rows of queues that have exactly 9 rows.
repexampleEdit <- repexample2[repexample2$numRows == 9, ]
# The original printed `repxampleEdit` (typo), which fails with
# "object not found".
print(repexampleEdit)
(I don't quite understand the meaning of 'check that it has 9 rows, or the corresponding values in the column "X8Tile"'.) You could edit the repexampleEdit line based on your needs.
This is an approach that makes some assumptions about how your data are ordered. It can be modified (or your data can be reordered) if the assumption doesn't fit:
## Collapse each queue's "X8Tile" values, in row order, into one string and
## strip leading/trailing whitespace from it.
tileStrings <- with(
  repexample,
  tapply(X8Tile, QueueName, FUN = function(tiles) {
    gsub("^\\s+|\\s+$", "", paste(tiles, collapse = ""))
  })
)
## A complete, correctly ordered queue reads exactly "Average12345678";
## a queue with anything missing will not.
myMatch <- names(which(tileStrings == "Average12345678"))
## Keep only the rows that belong to the complete queues
repexample[repexample$QueueName %in% myMatch, ]
# QueueName X8Tile Actual Calls Pop
# 1 Overall Average 508.1822 54948 41
# 2 Overall 1 334.6995 6896 6
# 3 Overall 2 404.9049 8831 5
# 4 Overall 3 469.4069 7825 5
# 5 Overall 4 489.2800 5768 5
# 6 Overall 5 516.5744 7943 5
# 7 Overall 6 551.7966 5796 5
# 8 Overall 7 601.5104 8698 5
# 9 Overall 8 720.9811 3191 5
# 14 CCM4.usci_retention_eng Average 535.2467 248 11
# 15 CCM4.usci_retention_eng 1 278.2500 11 2
# 16 CCM4.usci_retention_eng 2 409.9286 9 2
# 17 CCM4.usci_retention_eng 3 511.6635 94 2
# 18 CCM4.usci_retention_eng 4 553.0000 1 1
# 19 CCM4.usci_retention_eng 5 641.0000 65 1
# 20 CCM4.usci_retention_eng 6 676.1111 9 1
# 21 CCM4.usci_retention_eng 7 778.5517 29 1
# 22 CCM4.usci_retention_eng 8 886.3667 30 1
Similar approaches can be taken with aggregate+merge and similar tools.

Resources