R:Converting char to time (duration(hms)) - r

I'm attempting to change a column type from char to time and am not getting the results expected. Description and dput output follow:
I'm attempting to change the column called Duration from char to time:
structure(list(RouteID = c(12817402, 12817404, 12817406, 12817425,
12817426, 12817446, 12817447, 12817455, 12817481, 12817499, 12817599,
12817603, 12817631, 12817636, 12817655), PaymentPlan = c("Subscriber",
"Casual", "Subscriber", "Casual", "Casual", "Casual", "Subscriber",
"Subscriber", "Casual", "Casual", "Subscriber", "Casual", "Casual",
"Casual", "Casual"), StartHub = c("NW Johnson at Jamison Square",
"SE Ladd at Hawthorne", NA, "SE 50th at Clinton", NA, "SE 30th at Division",
"SW Morrison at 18th", "NE 42nd at Hancock", NA, NA, "SW River at Montgomery",
NA, "NW Flanders at 14th", NA, NA), StartLatitude = c(45.5286366,
45.5120818, 45.522783, 45.503506, 45.5086555, 45.50468892, 45.52196048,
45.536898, 45.5041753, 45.5014807, 45.50910258, 45.5233209, 45.52579919,
45.5315017, 45.5218677), StartLongitude = c(-122.6820195, -122.6533493,
-122.6811195, -122.611066, -122.6547299, -122.6345551, -122.6896772,
-122.619969, -122.6612413, -122.6557978, -122.6735169, -122.6963404,
-122.6855063, -122.6834541, -122.6747676), StartDate = c("12/1/2019",
"12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019",
"12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019",
"12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019"), StartTime = structure(c(1020,
1140, 1200, 2700, 2880, 6660, 6960, 8280, 13020, 15900, 23040,
23340, 24780, 24840, 25500), class = c("hms", "difftime"), units = "secs"),
EndHub = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "NW Couch at 11th",
"SE 2nd Pl at Tilikum Way", "NW Raleigh at 21st", NA, "SW 5th at Morrison"
), EndLatitude = c(45.524531, 45.5086555, 45.5090834, 45.5422432,
45.5041753, 45.5034396, 45.5312952, 45.5317187, 45.5014807,
45.5041753, 45.52374151, 45.50624163, 45.53409115, 45.5144089,
45.51889487), EndLongitude = c(-122.6744613, -122.6547299,
-122.6840225, -122.604573, -122.6612413, -122.639666, -122.6946193,
-122.6306539, -122.6557978, -122.6612413, -122.6818129, -122.6633379,
-122.6949424, -122.6840143, -122.6774061), EndDate = c("12/1/2019",
"12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019",
"12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019",
"12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019"), EndTime = structure(c(1740,
1440, 1860, 4200, 3780, 6840, 8220, 10560, 13380, 16500,
23880, 24420, 25140, 26100, 25800), class = c("hms", "difftime"
), units = "secs"), TripType = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), BikeID = c(6516, 24819, 7342, 6636, 24819,
7109, 6396, 6082, 24819, 24819, 6464, 7252, 6310, 6040, 6249
), BikeName = c("0060 BIKETOWN", "0199 BIKETOWN", "1016 LATINX HERITAGE MONTH",
"0825 BIKETOWN", "0199 BIKETOWN", "0749 BIKETOWN", "0707 BIKETOWN",
"0084 BIKETOWN", "0199 BIKETOWN", "0199 BIKETOWN", "0559 BIKETOWN",
"0845 BIKETOWN", "0868 BIKETOWN", "0901 BIKETOWN", "0300 BIKETOWN"
), Distance_Miles = c(0.72, 0.44, 1.18, 2.99, 1.36, 0.31,
0.89, 0.71, 0.58, 0.83, 1.5, 2.46, 1.02, 1.66, 0.34), Duration = c("0:11:10",
"0:05:18", "0:11:31", "0:24:28", "0:15:11", "0:03:11", "0:21:02",
"0:37:45", "0:05:38", "0:10:26", "0:13:33", "0:17:27", "0:06:42",
"0:20:58", "0:04:51"), RentalAccessPath = c("keypad", "mobile",
"mobile", "keypad", "mobile", "mobile", "keypad", "mobile",
"mobile", "mobile", "keypad", "keypad", "keypad", "keypad",
"keypad_rfid_card"), MultipleRental = c(FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE)), row.names = c(NA, -15L), class = c("tbl_df",
"tbl", "data.frame"))
I ran this:
# Load lubridate, which provides hms() and as.duration().
library(lubridate)
# Parse the "H:MM:SS" strings in the Duration column and convert them to a
# lubridate Duration vector.
# NOTE: the result is assigned to X2020_07 itself, so the data frame is
# replaced by this single vector instead of having its Duration column updated.
X2020_07 <- as.duration(hms(X2020_07$Duration))
where X df is structured similar to the dput df above.
The type was changed from char to time but there is only 1 column!:
new("Duration", .Data = c(1687, 73, 1499, 691, 475, 350, 538,
3018, 1594, 2447, 1160, 1185, 469, 1090, 424))
The goal is to preserve the 19-column df when it's piped to another df of the same name, i.e. X2020_07 <- X2020, while changing the column type to time.
Given my limited knowledge of R (I am enjoying learning), I am unsure what else I should be looking into. I would appreciate guidance and solutions!

Assuming as.duration results in the correct column type,
here is a solution using the pipe operator of dplyr:
library(dplyr)
library(lubridate)

# Rewrite only the Duration column (character "H:MM:SS" -> lubridate Duration);
# mutate() returns the full data frame, so all other columns of X2020 are kept.
X2020_07 <- X2020 %>%
  dplyr::mutate(
    Duration = Duration %>%
      lubridate::hms() %>%
      lubridate::as.duration()
  )

Related

How do I change the shape of the lines in a plot generated by a for loop?

I'm not sure how to change the shape and color of the lines drawn by the for loop below, for data in a df, fish:
structure(list(Region = structure(c(7L, 7L, 7L, 7L, 7L), .Label = c("American Samoa",
"Johnston Atoll", "Line Islands", "MHI", "Musicians Seamounts",
"Northern Marianas", "NWHI", "Southern Marianas", "Tokelau Ridge",
"Wake Island"), class = "factor"), ObservationYear = c(2015,
2015, 2015, 2015, 2015), `Mega-Habitat` = c("bank", "bank", "tablemount",
"bank", "atoll"), Total_fish = c(6, 10, 21, 11, 7), Lat = c(23.2227305,
25.0840027, 26.8267143809524, 26.8188378, 27.5178584285714),
Long = c(-163.516748333333, -172.490419, -175.607307619048,
-176.315991, -175.460592857143), Temperature = c(1.82256666666667,
2.00518, 3.03555714285714, 2.01533, 1.5475), Salinity = c(34.64115,
34.61702, 34.4760619047619, 34.61106, 34.6673857142857),
Oxygen = c(3.16008333333333, 2.79735, 1.27077619047619, 2.58692,
3.73167142857143), Distance = c(350, 960, 1130, 360, 460),
`CTD Availability` = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_), depth_bin = c("2000-3000",
"1000-2000", "1000-2000", "1000-2000", "2000-3000"), EventID = c("D2-EX1504L2-01",
"D2-EX1504L2-06", "D2-EX1504L2-08", "D2-EX1504L2-10", "D2-EX1504L2-12"
), Average_Depth = c(2160.20383333333, 1880.4596, 1217.94385,
1890.1868, 2780.92557142857), POC_Flux = c(2.56732258067581,
2.86961424536357, 3.38129564627503, 3.38129564627503, 3.80216410589398
)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"
))
I ran a GAM before the loop:
# Negative-binomial GAM of Total_fish: a separate smooth of Average_Depth for
# each Region, a Region main effect, and log(Distance) as an offset.
g1 <- mgcv::gam(
  Total_fish ~ s(Average_Depth, by = Region) + Region + offset(log(Distance)),
  data = fish,
  family = nb
)
I tried defining shapes, but it did not work
# Integer codes 0-10 intended as shapes, then re-indexed by each row's Region
# level code.
shapes= c(0,1,2,3,4,5,6,7,8,9,10)
shapes <-shapes[as.numeric(fish$Region)]
# One rainbow colour per Region level.
colors.use = rainbow(nlevels(fish$Region))
# Draw one predicted curve per Region level: the first iteration creates the
# plot, every later iteration adds a line to it.
for (i in 1:nlevels(fish$Region)) {
# Predict on the response scale over a 1000-point depth grid from 0 to 3000,
# with Distance fixed at 1000, for the i-th Region level.
predictions = predict(g1, type="response", newdata = data.frame(Distance= 1000, Average_Depth = seq(0,3000,length=1000), Region = levels(fish$Region)[i]))
# NOTE(review): pch selects point symbols and has no visible effect when only
# lines are drawn (type = 'l' here, and the default type of lines()). The
# `shapes` vector built above is never passed to plot() or lines(). Line
# appearance is controlled by lty (and lwd), e.g. lty = i.
if (i == 1) plot(xlab= "Depth (m)", ylab = "fish/1000 m",seq(0,3000,length=1000), predictions, type = 'l', col=colors.use[i], pch=i)
if (i > 1) lines(seq(0,3000,length=1000), predictions,col=colors.use[i],pch=i)
}
I just need to be able to differentiate between the regions and the current rainbow colors alone are not very useful for that

Removing punctuation and all capitalization in newly generated columns (RStudio)

I am new to R, and while I do know some of the basics, I've been unable to figure out how to add new columns (preferably using the mutate() function) to a table which lack any punctuation or capitalization.
I exported around 20,000 observations from the citizen science network iNaturalist in an effort to determine which species are most commonly misidentified. To accomplish this, my goal is to have R compare the value for each observation in the species_guess column (which consists of variably punctuated and capitalized common and scientific names) to the corresponding name in either the taxon_species_name column (standardized, uniform scientific names) or the common_name column (which contains standardized, uniform common names). For each observation, I'd like to have TRUE or FALSE printed in a new column, correct_identification, depending on whether the species_guess matches one of the latter two columns.
I expect that accomplishing this would require the following:
the creation of three new columns which are the same as species_guess, taxon_species_name, and common_name but are all lowercase and have no punctuation.
the creation of a correct_identification column which reads TRUE or FALSE depending on whether the new species_guess matches taxon_species_name or common_name. I think I can do this step myself.
species_guess sample
Please don't hesitate to ask clarifying questions as needed. I am happy to provide more code samples. As requested, the output from the dput function (specifically using the code provided by #IRTFM) has been pasted at the bottom.
I found information on grep() and tolower(), but I really have no idea how to use them to create a new column. There's a lot on removing punctuation from a string, but I'm not sure how those methods would be applicable to an entire column in a dataset.
Thanks!
structure(list(id = c(99512L, 190432L, 207211L, 276566L, 298366L,
380464L), observed_on_string = c("Fri Jul 06 2012 14:35:33 GMT-0400 (EDT)",
"2009-09-19", "2012-06-13", "6/23/2010", "2013-06-13", "2013-08-27"
), observed_on = c("2012-07-06", "2009-09-19", "2012-06-13",
"2010-06-23", "2013-06-13", "2013-08-27"), time_observed_at = c("2012-07-06 18:35:33 UTC",
NA, NA, NA, NA, NA), time_zone = c("Eastern Time (US & Canada)",
"Eastern Time (US & Canada)", "Eastern Time (US & Canada)", "Eastern Time (US & Canada)",
"Eastern Time (US & Canada)", "Eastern Time (US & Canada)"),
user_id = c(2179L, 12610L, 13594L, 12035L, 12610L, 13406L
), user_login = c("charlie", "susanelliott", "bheitzman",
"sfaccio", "susanelliott", "hobiecat"), user_name = c("Charlie Hohn",
"Susan Elliott", "Bob Heitzman", "Steve Faccio", "Susan Elliott",
NA), created_at = c("2012-07-07 19:56:36 UTC", "2013-02-02 16:19:29 UTC",
"2013-03-01 02:00:25 UTC", "2013-05-23 19:32:44 UTC", "2013-06-13 18:57:38 UTC",
"2013-08-28 03:04:18 UTC"), updated_at = c("2019-01-08 21:22:48 UTC",
"2020-02-13 19:16:34 UTC", "2021-06-27 23:36:32 UTC", "2016-09-20 02:53:33 UTC",
"2017-09-26 01:21:35 UTC", "2020-02-12 01:23:48 UTC"), quality_grade = c("research",
"research", "research", "research", "research", "research"
), license = c("CC0", "CC-BY-NC", "CC-BY-NC", NA, "CC-BY-NC",
"CC-BY-NC"), url = c("http://www.inaturalist.org/observations/99512",
"http://www.inaturalist.org/observations/190432", "http://www.inaturalist.org/observations/207211",
"http://www.inaturalist.org/observations/276566", "http://www.inaturalist.org/observations/298366",
"http://www.inaturalist.org/observations/380464"), image_url = c("https://inaturalist-open-data.s3.amazonaws.com/photos/144232/medium.jpg",
"https://inaturalist-open-data.s3.amazonaws.com/photos/244969/medium.jpg",
"https://inaturalist-open-data.s3.amazonaws.com/photos/262914/medium.JPG",
"http://static.inaturalist.org/photos/342086/medium.JPG",
"https://inaturalist-open-data.s3.amazonaws.com/photos/369424/medium.jpg",
"https://inaturalist-open-data.s3.amazonaws.com/photos/475664/medium.jpg"
), sound_url = c(NA, NA, NA, NA, NA, NA), tag_list = c(NA,
"Spiranthes, ladies tresses, plant", "Spiranthes, lucida, orchid, Vermont",
NA, NA, NA), description = c(NA, NA, "S. lucida can be found in heavily scoured sections of the river banks, generally on the downstream side of boulders, where they are protected during floods. Very hardy, stout plants, with distinctive thick leaf whorls.\nFlower spikes are distinctive in mid-June, with 6-20 blossoms in a spiral.",
"Many blooming around pond edge.", NA, "Ladies' Tresses "
), num_identification_agreements = c(2L, 0L, 2L, 1L, 1L,
1L), num_identification_disagreements = c(0L, 0L, 0L, 0L,
0L, 0L), captive_cultivated = c("false", "false", "false",
"false", "false", "false"), oauth_application_id = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), place_guess = c("United States", "Vermont, US", "Vermont, US",
"Vermont, US", "Vermont, US", "Grand Isle, VT"), latitude = c(43.6243306384,
44.7147801982, 43.6528495032, 43.9558655593, 43.8546044617,
44.75182), longitude = c(-73.2028825367, -71.933891759, -72.2231645845,
-72.5525452841, -73.1619811058, -73.30593), positional_accuracy = c(5L,
NA, NA, NA, 166L, NA), private_place_guess = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), private_latitude = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), private_longitude = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), public_positional_accuracy = c(27443L,
27285L, 27443L, 27396L, 27396L, NA), geoprivacy = c("obscured",
"obscured", "obscured", NA, NA, NA), taxon_geoprivacy = c("obscured",
NA, "obscured", "obscured", "obscured", NA), coordinates_obscured = c("true",
"true", "true", "true", "true", "false"), positioning_method = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), positioning_device = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), place_town_name = c(NA, NA, NA, NA, NA, "Grand Isle"),
place_county_name = c("Rutland", "Essex", "Windsor", "Orange",
"Addison", "Grand Isle"), place_state_name = c("Vermont",
"Vermont", "Vermont", "Vermont", "Vermont", "Vermont"), species_guess = c("Northern Slender Ladies'-tresses",
"Sphinx ladies’ tresses", "Spiranthes lucida", "Shining Ladies' Tresses",
"Shining Ladies' Tresses", "Sphinx ladies’ tresses"), scientific_name = c("Spiranthes lacera lacera",
"Spiranthes incurva", "Spiranthes lucida", "Spiranthes lucida",
"Spiranthes lucida", "Spiranthes incurva"), common_name = c("Northern Slender Ladies'-tresses",
"Sphinx ladies’ tresses", "Shining Ladies' Tresses", "Shining Ladies' Tresses",
"Shining Ladies' Tresses", "Sphinx ladies’ tresses"), iconic_taxon_name = c("Plantae",
"Plantae", "Plantae", "Plantae", "Plantae", "Plantae"), taxon_id = c(243059L,
773387L, 62254L, 62254L, 62254L, 773387L), taxon_subfamily_name = c("Orchidoideae",
"Orchidoideae", "Orchidoideae", "Orchidoideae", "Orchidoideae",
"Orchidoideae"), taxon_tribe_name = c("Cranichideae", "Cranichideae",
"Cranichideae", "Cranichideae", "Cranichideae", "Cranichideae"
), taxon_subtribe_name = c("Spiranthinae", "Spiranthinae",
"Spiranthinae", "Spiranthinae", "Spiranthinae", "Spiranthinae"
), taxon_genus_name = c("Spiranthes", "Spiranthes", "Spiranthes",
"Spiranthes", "Spiranthes", "Spiranthes"), taxon_species_name = c("Spiranthes lacera",
"Spiranthes incurva", "Spiranthes lucida", "Spiranthes lucida",
"Spiranthes lucida", "Spiranthes incurva"), taxon_hybrid_name = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), taxon_variety_name = c("Spiranthes lacera lacera",
NA, NA, NA, NA, NA)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
UPDATE: found a solution!
# Add standardized_species_guess: species_guess in lower case, with every run
# of punctuation characters and/or spaces collapsed to a single space.
spiranthes <- spiranthes %>%
  mutate(
    standardized_species_guess = gsub(
      pattern = '[[:punct:] ]+',
      replacement = ' ',
      x = tolower(species_guess)
    )
  )
# Inspect the updated table.
view(spiranthes)
Hopefully this helps anyone else who may be struggling with the same thing.

R:Duplicate column with NA values created with bind_rows

The combining of multiple df using bind_rows produced an unwanted duplicate column in the resulting df:
# Stack the per-file trip data frames into a single table.
# NOTE(review): bind_rows() matches columns by name and fills columns missing
# from an input with NA. The sample output below contains both
# `Distance_Miles_` and `Distance_Miles`, so presumably at least one input
# spells the column with a trailing underscore -- confirm with names() on each
# input and rename before binding.
all_trips_raw <-
bind_rows(X2020_08, X2020_06, X2020_05,
X2020_04, X2020_03, X2020_02, X2020_01,
X2019_12_Dur, X2019_11, X2019_10, X2019_09)
Where the X df were the result of an import of 12 csv files that were uploaded into R studio cloud-both the csv files and resulting df's have 19 columns. The column in question is Distance_Miles. The column with data came from the separate data frames, and the new df has one with NA.
structure(list(RouteID = c(13442256, 13442257, 13442261, 13442275,
13442279), PaymentPlan = c("Casual", "Casual", "Subscriber",
"Subscriber", "Casual"), StartHub = c("SW Yamhill at Director Park",
"SW Yamhill at Director Park", NA, "NW Station at Irving", NA
), StartLatitude = c(45.51898132, 45.51898132, 45.5133558, 45.5282777,
45.5167987), StartLongitude = c(-122.6812685, -122.6812685, -122.6828884,
-122.6766282, -122.6729466), StartDate = c("8/1/2020", "8/1/2020",
"8/1/2020", "8/1/2020", "8/1/2020"), StartTime = structure(c(240,
300, 480, 1680, 2040), class = c("hms", "difftime"), units = "secs"),
EndHub = c("SW Yamhill at Director Park", "SW Yamhill at Director Park",
NA, NA, "SE Ladd at Hawthorne"), EndLatitude = c(45.51898132,
45.51898132, 45.5252069, 45.5266354, 45.5120818), EndLongitude = c(-122.6812685,
-122.6812685, -122.6765159, -122.6765624, -122.6533493),
EndDate = c("8/1/2020", "8/1/2020", "8/1/2020", "8/1/2020",
"8/1/2020"), EndTime = structure(c(2100, 2100, 1260, 1740,
2820), class = c("hms", "difftime"), units = "secs"), TripType = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), BikeID = c(5995, 6380, 7317, 6177, 6632), BikeName = c("0916 BIKETOWN",
"0694 BIKETOWN", "9890 ASCEND BIKE", "0367 PBOT BIKETOWN",
"0278 BIKETOWN"), Distance_Miles_ = c(1.85, 1.88, 1.05, 0.11,
1.27), Duration = structure(c(1837, 1771, 768, 110, 782), class = c("hms",
"difftime"), units = "secs"), RentalAccessPath = c("keypad",
"keypad", "keypad", "keypad", "mobile"), MultipleRental = c(FALSE,
FALSE, FALSE, FALSE, FALSE), Distance_Miles = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_)), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"))
Would importing the csv files as a dplyr data_frame make a difference when using bind_rows, instead of the base data.frame implementation?
Should the bind_rows statement have been written differently, to prevent the duplicate column with NA values?
I also tried this to remove the added column:
# Find Duplicate Column Names
# (duplicated() flags only names that repeat exactly; two columns whose names
# differ by even one character are not flagged.)
duplicated_names <- duplicated(colnames(my_df))
# Remove Duplicate Column Names
# (The subset is returned but not assigned, so my_df itself is left unchanged.)
my_df[!duplicated_names]
where my_df was all_trips_raw

Trouble using filter and str_detect to remove participants

I am conducting a food-related study and would like to remove all of a participant's data if they identified that they have any food-related allergies during the questionnaire part of my study. I am trying to accomplish this using group_by, filter and str_detect.
Unfortunately, the code I have at the moment results in a new table, with only the answers with "gluten". The group_by function also does not function as expected, as it doesn't remove all the participants' answers, only the rows that contain "gluten".
Here is the code I have now. I would like all of a participant's answers to be removed if they answered "gluten" anywhere in the question:
# Group rows by participant (user_id), then filter on the free-text answer
# column dv.
# NOTE: filter() KEEPS the rows for which the condition is TRUE, and
# str_detect() is evaluated row by row, so this returns only the rows whose dv
# contains "Gluten"/"gluten". To drop every row of a participant who mentions
# gluten anywhere, the condition has to be a per-group summary, e.g.
# !any(str_detect(dv, "(G|g)luten")).
my_data_raw_quest %>%
group_by(user_id) %>%
filter(
str_detect(dv, "(G|g)luten"))
Here is the table created from that code.
structure(list(session_id = c(53877, 53891, 54090, 54469, 54929,
55038, 55061, 55096, 55104, 55108, 55145, 57068, 57074, 57146,
57276, 57435, 57952, 58817), project_id = c(495, 495, 495, 495,
495, 495, 495, 495, 495, 495, 495, 495, 495, 495, 495, 495, 495,
495), quest_name = c("Sociodemographic", "Sociodemographic",
"Sociodemographic", "Sociodemographic", "Sociodemographic", "Sociodemographic",
"Sociodemographic", "Sociodemographic", "Sociodemographic", "Sociodemographic",
"Sociodemographic", "Sociodemographic", "Sociodemographic", "Sociodemographic",
"Sociodemographic", "Sociodemographic", "Sociodemographic", "Sociodemographic"
), quest_id = c(2189, 2189, 2189, 2189, 2189, 2189, 2189, 2189,
2189, 2189, 2189, 2189, 2189, 2189, 2189, 2189, 2189, 2189),
user_id = c(47667, 47681, 47877, 48251, 48705, 48816, 48839,
48873, 48881, 48881, 48921, 50663, 50723, 50794, 50924, 51077,
51561, 52161), user_sex = c("male", "female", "female", "female",
"female", "na", "female", "female", "female", "female", "female",
"female", "female", "female", "female", "female", "male",
"female"), user_status = c("test", "test", "guest", "guest",
"registered", "guest", "guest", "guest", "test", "test",
"guest", "registered", "guest", "guest", "guest", "guest",
"guest", "test"), user_age = c(59, 40, 35, 38, 53.7, 28,
21, 65, 24, 24, 25, 20.8, 38, 44, 32, 34, 44, 20), q_name = c("food allergies",
"food allergies", "food allergies", "food allergies", "food allergies",
"food allergies", "food allergies", "food allergies", "food allergies",
"food allergies", "food allergies", "food allergies", "food allergies",
"food allergies", "food allergies", "food allergies", "Other",
"food allergies"), q_id = c(92827397, 92827397, 92827397,
92827397, 92827397, 92827397, 92827397, 92827397, 92827397,
92827397, 92827397, 92827397, 92827397, 92827397, 92827397,
92827397, 92831398, 92827397), order = c(4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4), dv = c("Gluten", "Gluten, cumin, paprika, anchovies",
"Gluten intolerance", "Dairy, gluten some veg, fruit and nuts",
"Gluten", "Gluten", "Gluten intolerant", "Gluten", "No allergies, but intolerant to gluten",
"No allergies, but gluten intolerant", "Lactose & gluten",
"gluten and dairy intolerance", "Sensitive to gluten and soy",
"Gluten", "Gluten", "Gluten", "Locked down with family, sister is gluten free",
"I am conscious of what gluten i eat as it sets my eczema off"
), starttime = structure(c(1607970136, 1607970692, 1607975785,
1607984805, 1608023741, 1608037872, 1608041491, 1608047134,
1608048524, 1608048811, 1608055657, 1609950997, 1609951334,
1609953692, 1609961095, 1609976350, 1610182572, 1610465355
), tzone = "UTC", class = c("POSIXct", "POSIXt")), endtime = structure(c(1607970180,
1607970791, 1607975825, 1607984927, 1608023787, 1608037944,
1608041525, 1608047239, 1608048613, 1608048856, 1608055709,
1609951071, 1609951428, 1609953730, 1609961133, 1609976399,
1610182657, 1610465458), tzone = "UTC", class = c("POSIXct",
"POSIXt")), undergraduate = c(FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE), NoUni = c(FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE), Masters = c(FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE),
Postgraduate = c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE), degree = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
)), row.names = c(NA, -18L), groups = structure(list(user_id = c(47667,
47681, 47877, 48251, 48705, 48816, 48839, 48873, 48881, 48921,
50663, 50723, 50794, 50924, 51077, 51561, 52161), .rows = structure(list(
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9:10, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 18L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -17L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))

R Function help to obtain only the Unique values to then obtain basic metrics

I am trying to obtain the unique values for Number.Full in the below.
n_distinct() brings me the distinct count of the Number.Full. But it doesn't feed that into the min()/max()/mean() counts.
I have tried putting distinct and unique as part of the filter() and placing it after the filter() as a new variable.
But I can't seem to get it to feed in/work properly.
Any help or suggestions are greatly welcome.
Edit 1 for dput data:
# For rows whose Specimen.Number.Left.2 equals "AB", grouped by
# Rec_Period_Month_Yr: count the distinct Number.Full values and compute the
# min, max and mean of TaT_Coll_to_Auth, then sort by Rec_Period_Month_Yr.
# NOTE: n_distinct() only affects the count. min(), max() and mean() are
# computed over every filtered row in the group, including rows that repeat a
# Number.Full value; they are not restricted to one row per distinct value.
nRequests_byYearMth <- df_Raw_Data %>%
filter(Specimen.Number.Left.2 == "AB") %>%
group_by(Rec_Period_Month_Yr) %>%
summarise(Number.Full = n_distinct(Number.Full), min(TaT_Coll_to_Auth), max(TaT_Coll_to_Auth), mean(TaT_Coll_to_Auth)) %>%
arrange(Rec_Period_Month_Yr)
structure(list(Receive.Date = c("2019-09-20", "2019-09-20", "2019-06-24",
"2019-05-23", "2019-09-05", "2019-07-30"), Number.Full = c("04023119",
"04023119", "02634719", "02190819", "00273419",
"03234219"), Ex.No = c("", "", "19P08645QQ5",
"", "", ""), Order.Comment = c("CT11", "CT11", "HR", "SHU",
"", "ICCZZ"), Coll.Date.Source = c("1931-02-04", "1931-02-04",
"1949-01-04", "2000-12-23", "2012-09-05", "2015-05-02"), Location.Code = c("FH7895SS",
"FHSA785", "VB97S", "RV0158", "FH29567", "N1"), Loc.Des = c("FWC",
"FU", "VHB", "RDO",
"F29", "NSBRU"), Tissue.Code = c("LEX",
"LEX", "RC", "SKL", "NPL", "RC"), T.Name = c("ELung",
"ELung", "Referred", "Skin", "Pleural",
"Referred Case"), Current.Status = c("S", "S", "S", "S",
"S", "S"), Date.Updated = c("2019-10-20", "2019-10-20",
"2019-06-24", "2019-05-28", "2019-09-13", "2019-08-07"), Reporting.1 = c("LYNN",
"LYNN", "ROBCM", "HUSA", "SPOE", "CPATH"), Reporting.2 = c("MAJJ",
"MAJJ", "", "", "ROBB", ""), Reporting.3 = c("",
"", "", "", "FERB", ""), Reporting.4 = c("", "",
"", "", "", ""), Reporting.5 = c("", "", "", "",
"", ""), Number.Left.2 = c("AB", "AB", "AB", "AB", "CN",
"AB"), Auth_Period_Month_Yr = c("2019-10", "2019-10", "2019-06",
"2019-05", "2019-09", "2019-08"), Rec_Period_Month_Yr = c("2019-09",
"2019-09", "2019-06", "2019-05", "2019-09", "2019-07"), TaT_Coll_to_Auth = structure(c(32400,
32400, 25738, 6730, 2564, 1558), class = "difftime", units = "days"),
M.Weighting = c(50L, 50L, 0L, 30L, NA, 0L)), row.names = c(NA,
6L), class = "data.frame")
From the nRequests_byYearMth formula I was expecting it to filter() to only show the AB entries, then group those by the Rec_Period_Month_Yr; when it was summarised I had it count the distinct entries (n_distinct()), and then the min()/max()/mean() would also show the data relating to the filtered results.
But when I've used Excel to look at the data extract I'm using, it doesn't seem to be filtering correctly.
I am thinking that I need to have the filter applied to the summarise() somehow.
Edit with outputs:
The resulting output is:
structure(list(Rec_Period_Month_Yr = c("2019-04", "2019-05",
"2019-06", "2019-07", "2019-08", "2019-09", "2019-10", "2019-11",
"2019-12", "2020-01", "2020-02", "2020-03"), Specimen.Number.Full = c(4881L,
4929L, 4902L, 5289L, 4815L, 5043L, 5697L, 5051L, 4552L, 5434L,
4917L, 4556L), `min(TaT_Coll_to_Auth)` = structure(c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), class = "difftime", units = "days"),
`max(TaT_Coll_to_Auth)` = structure(c(368, 6730, 25738, 1558,
222, 32400, 374, 150, 320, 97, 382, 60), class = "difftime", units = "days"),
`mean(TaT_Coll_to_Auth)` = structure(c(9.80235422940049,
10.768904109589, 14.8278848840458, 10.0686706074708, 10.2533425223983,
19.6828624240824, 11.8121527777778, 10.4033579583613, 10.4007004231723,
9.04840344652813, 8.94940393678958, 8.2197571578474), class = "difftime", units = "days")), row.names = c(NA,
-12L), class = c("tbl_df", "tbl", "data.frame"))
The expected output I want is below. But I can only create this if I only look at the AB entries, and summarise() doesn't seem to do that for the min()/max()/mean(); instead it looks at all of the entries in the column.
I need it to look at only the entries relating to the AB filter() (for all the summarised items.)
(The last Max entry shows as 60 in R but if properly filtered would show as 50)
structure(list(Year.and.Mth = c("2019-4", "2019-5", "2019-6",
"2019-7", "2019-8", "2019-9", "2019-10", "2019-11", "2019-12",
"2020-1", "2020-2", "2020-3"), Number.Full = c(4881, 4929, 4902,
5289, 4815, 5043, 5697, 5051, 4552, 5434, 4917, 4556), Max = c(113,
6730, 25738, 1558, 156, 32400, 374, 109, 320, 97, 382, 50), Mean = c(7.97705388240115,
9.34286873605194, 13.514891880865, 8.39194554736245, 7.72294911734164,
15.2502478683323, 9.15850447604002, 8.85389031874876, 9.00021968365554,
7.76573426573427, 7.97335773845841, 7.350526778)), class = "data.frame", row.names = c(NA,
-12L))

Resources