R:Duplicate column with NA values created with bind_rows - r

The combining of multiple df using bind_rows produced an unwanted duplicate column in the resulting df:
# Stack the monthly trip data frames row-wise into a single raw data frame.
# NOTE(review): bind_rows() matches columns by NAME, not position; if one
# monthly file names the distance column differently (the dput below shows
# "Distance_Miles_" alongside "Distance_Miles"), BOTH columns appear in the
# result, each padded with NA for the rows that lack it. Rename the columns
# to agree before binding to avoid the duplicate.
all_trips_raw <-
bind_rows(X2020_08, X2020_06, X2020_05,
X2020_04, X2020_03, X2020_02, X2020_01,
X2019_12_Dur, X2019_11, X2019_10, X2019_09)
Where the X dfs were the result of an import of 12 csv files that were uploaded into RStudio Cloud — both the csv files and the resulting dfs have 19 columns. The column in question is Distance_Miles. The column with data came from the separate data frames, and the new df has one with NA.
structure(list(RouteID = c(13442256, 13442257, 13442261, 13442275,
13442279), PaymentPlan = c("Casual", "Casual", "Subscriber",
"Subscriber", "Casual"), StartHub = c("SW Yamhill at Director Park",
"SW Yamhill at Director Park", NA, "NW Station at Irving", NA
), StartLatitude = c(45.51898132, 45.51898132, 45.5133558, 45.5282777,
45.5167987), StartLongitude = c(-122.6812685, -122.6812685, -122.6828884,
-122.6766282, -122.6729466), StartDate = c("8/1/2020", "8/1/2020",
"8/1/2020", "8/1/2020", "8/1/2020"), StartTime = structure(c(240,
300, 480, 1680, 2040), class = c("hms", "difftime"), units = "secs"),
EndHub = c("SW Yamhill at Director Park", "SW Yamhill at Director Park",
NA, NA, "SE Ladd at Hawthorne"), EndLatitude = c(45.51898132,
45.51898132, 45.5252069, 45.5266354, 45.5120818), EndLongitude = c(-122.6812685,
-122.6812685, -122.6765159, -122.6765624, -122.6533493),
EndDate = c("8/1/2020", "8/1/2020", "8/1/2020", "8/1/2020",
"8/1/2020"), EndTime = structure(c(2100, 2100, 1260, 1740,
2820), class = c("hms", "difftime"), units = "secs"), TripType = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), BikeID = c(5995, 6380, 7317, 6177, 6632), BikeName = c("0916 BIKETOWN",
"0694 BIKETOWN", "9890 ASCEND BIKE", "0367 PBOT BIKETOWN",
"0278 BIKETOWN"), Distance_Miles_ = c(1.85, 1.88, 1.05, 0.11,
1.27), Duration = structure(c(1837, 1771, 768, 110, 782), class = c("hms",
"difftime"), units = "secs"), RentalAccessPath = c("keypad",
"keypad", "keypad", "keypad", "mobile"), MultipleRental = c(FALSE,
FALSE, FALSE, FALSE, FALSE), Distance_Miles = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_)), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"))
Would importing the csv as a dplyr data frame (tibble) make a difference, when using bind_rows, instead of the base data.frame implementation?
Should the bind_rows statement been written differently, to prevent the duplicate column with NA values?
I also tried this to remove the added column:
# Find Duplicate Column Names
# NOTE(review): duplicated() only flags EXACT repeats of an earlier name, so
# near-duplicates such as "Distance_Miles_" vs "Distance_Miles" are both kept
# and this approach cannot remove the unwanted column here.
duplicated_names <- duplicated(colnames(my_df))
# Remove Duplicate Column Names
# Logical column subsetting keeps every column whose name was not a repeat.
# The result is not assigned back, so my_df itself is left unchanged.
my_df[!duplicated_names]
where my_df was all_trips_raw

Related

R:Converting char to time (duration(hms))

I'm attempting to change a column type from char to time and am not getting the results expected. Description and dput output follow:
I'm attempting to change the column called Duration from char to time:
structure(list(RouteID = c(12817402, 12817404, 12817406, 12817425,
12817426, 12817446, 12817447, 12817455, 12817481, 12817499, 12817599,
12817603, 12817631, 12817636, 12817655), PaymentPlan = c("Subscriber",
"Casual", "Subscriber", "Casual", "Casual", "Casual", "Subscriber",
"Subscriber", "Casual", "Casual", "Subscriber", "Casual", "Casual",
"Casual", "Casual"), StartHub = c("NW Johnson at Jamison Square",
"SE Ladd at Hawthorne", NA, "SE 50th at Clinton", NA, "SE 30th at Division",
"SW Morrison at 18th", "NE 42nd at Hancock", NA, NA, "SW River at Montgomery",
NA, "NW Flanders at 14th", NA, NA), StartLatitude = c(45.5286366,
45.5120818, 45.522783, 45.503506, 45.5086555, 45.50468892, 45.52196048,
45.536898, 45.5041753, 45.5014807, 45.50910258, 45.5233209, 45.52579919,
45.5315017, 45.5218677), StartLongitude = c(-122.6820195, -122.6533493,
-122.6811195, -122.611066, -122.6547299, -122.6345551, -122.6896772,
-122.619969, -122.6612413, -122.6557978, -122.6735169, -122.6963404,
-122.6855063, -122.6834541, -122.6747676), StartDate = c("12/1/2019",
"12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019",
"12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019",
"12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019"), StartTime = structure(c(1020,
1140, 1200, 2700, 2880, 6660, 6960, 8280, 13020, 15900, 23040,
23340, 24780, 24840, 25500), class = c("hms", "difftime"), units = "secs"),
EndHub = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "NW Couch at 11th",
"SE 2nd Pl at Tilikum Way", "NW Raleigh at 21st", NA, "SW 5th at Morrison"
), EndLatitude = c(45.524531, 45.5086555, 45.5090834, 45.5422432,
45.5041753, 45.5034396, 45.5312952, 45.5317187, 45.5014807,
45.5041753, 45.52374151, 45.50624163, 45.53409115, 45.5144089,
45.51889487), EndLongitude = c(-122.6744613, -122.6547299,
-122.6840225, -122.604573, -122.6612413, -122.639666, -122.6946193,
-122.6306539, -122.6557978, -122.6612413, -122.6818129, -122.6633379,
-122.6949424, -122.6840143, -122.6774061), EndDate = c("12/1/2019",
"12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019",
"12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019",
"12/1/2019", "12/1/2019", "12/1/2019", "12/1/2019"), EndTime = structure(c(1740,
1440, 1860, 4200, 3780, 6840, 8220, 10560, 13380, 16500,
23880, 24420, 25140, 26100, 25800), class = c("hms", "difftime"
), units = "secs"), TripType = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), BikeID = c(6516, 24819, 7342, 6636, 24819,
7109, 6396, 6082, 24819, 24819, 6464, 7252, 6310, 6040, 6249
), BikeName = c("0060 BIKETOWN", "0199 BIKETOWN", "1016 LATINX HERITAGE MONTH",
"0825 BIKETOWN", "0199 BIKETOWN", "0749 BIKETOWN", "0707 BIKETOWN",
"0084 BIKETOWN", "0199 BIKETOWN", "0199 BIKETOWN", "0559 BIKETOWN",
"0845 BIKETOWN", "0868 BIKETOWN", "0901 BIKETOWN", "0300 BIKETOWN"
), Distance_Miles = c(0.72, 0.44, 1.18, 2.99, 1.36, 0.31,
0.89, 0.71, 0.58, 0.83, 1.5, 2.46, 1.02, 1.66, 0.34), Duration = c("0:11:10",
"0:05:18", "0:11:31", "0:24:28", "0:15:11", "0:03:11", "0:21:02",
"0:37:45", "0:05:38", "0:10:26", "0:13:33", "0:17:27", "0:06:42",
"0:20:58", "0:04:51"), RentalAccessPath = c("keypad", "mobile",
"mobile", "keypad", "mobile", "mobile", "keypad", "mobile",
"mobile", "mobile", "keypad", "keypad", "keypad", "keypad",
"keypad_rfid_card"), MultipleRental = c(FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE)), row.names = c(NA, -15L), class = c("tbl_df",
"tbl", "data.frame"))
I ran this:
library(lubridate)
# NOTE(review): the right-hand side extracts only the Duration column, so
# assigning it back to X2020_07 REPLACES the entire data frame with a single
# Duration vector — that is why only 1 column remains afterwards. Assign to
# the column (e.g. via mutate) instead of the whole object.
X2020_07 <- as.duration(hms(X2020_07$Duration))
where X df is structured similar to the dput df above.
The type was changed from char to time but there is only 1 column!:
new("Duration", .Data = c(1687, 73, 1499, 691, 475, 350, 538,
3018, 1594, 2447, 1160, 1185, 469, 1090, 424))
The goal is to preserve the 19-column df when it's assigned to another df of the same name, i.e. X2020_07 <- X2020, while changing the column type to time.
Given my limited knowledge of R (I am enjoying learning) unsure what else I should be looking into. Appreciate guidance and solutions!
Assuming as.duration results in the correct column type.
Here is a solution using the pipe operator of dplyr:
library(dplyr)
library(lubridate)

# Parse the "H:M:S" strings in Duration and convert them to Duration objects,
# updating that one column in place while keeping all other columns intact.
X2020_07 <- dplyr::mutate(
  X2020,
  Duration = lubridate::as.duration(lubridate::hms(Duration))
)

R Function help to obtain only the Unique values to then obtain basic metrics

I am trying to obtain the unique values for Number.Full in the below.
n_distinct() brings me the distinct count of the Number.Full. But it doesn't feed that into the min()/max()/mean() counts.
I have tried putting distinct and unique as part of the filter() and placing it after the filter() as a new variable.
But I can't seem to get it to feed in/work properly.
Any help or suggestions are greatly welcome.
Edit 1 for dput data:
# Summarise AB specimens per receive month: distinct request count plus
# min/max/mean turnaround from collection to authorisation.
nRequests_byYearMth <- df_Raw_Data %>%
# Keep AB entries only. NOTE(review): the dput above names this column
# "Number.Left.2", not "Specimen.Number.Left.2" — confirm which name the raw
# data actually uses, otherwise this filter cannot work as intended.
filter(Specimen.Number.Left.2 == "AB") %>%
group_by(Rec_Period_Month_Yr) %>%
# The unnamed min()/max()/mean() calls become backtick-quoted column names
# such as `min(TaT_Coll_to_Auth)` in the output; name them for clean columns.
summarise(Number.Full = n_distinct(Number.Full), min(TaT_Coll_to_Auth), max(TaT_Coll_to_Auth), mean(TaT_Coll_to_Auth)) %>%
arrange(Rec_Period_Month_Yr)
structure(list(Receive.Date = c("2019-09-20", "2019-09-20", "2019-06-24",
"2019-05-23", "2019-09-05", "2019-07-30"), Number.Full = c("04023119",
"04023119", "02634719", "02190819", "00273419",
"03234219"), Ex.No = c("", "", "19P08645QQ5",
"", "", ""), Order.Comment = c("CT11", "CT11", "HR", "SHU",
"", "ICCZZ"), Coll.Date.Source = c("1931-02-04", "1931-02-04",
"1949-01-04", "2000-12-23", "2012-09-05", "2015-05-02"), Location.Code = c("FH7895SS",
"FHSA785", "VB97S", "RV0158", "FH29567", "N1"), Loc.Des = c("FWC",
"FU", "VHB", "RDO",
"F29", "NSBRU"), Tissue.Code = c("LEX",
"LEX", "RC", "SKL", "NPL", "RC"), T.Name = c("ELung",
"ELung", "Referred", "Skin", "Pleural",
"Referred Case"), Current.Status = c("S", "S", "S", "S",
"S", "S"), Date.Updated = c("2019-10-20", "2019-10-20",
"2019-06-24", "2019-05-28", "2019-09-13", "2019-08-07"), Reporting.1 = c("LYNN",
"LYNN", "ROBCM", "HUSA", "SPOE", "CPATH"), Reporting.2 = c("MAJJ",
"MAJJ", "", "", "ROBB", ""), Reporting.3 = c("",
"", "", "", "FERB", ""), Reporting.4 = c("", "",
"", "", "", ""), Reporting.5 = c("", "", "", "",
"", ""), Number.Left.2 = c("AB", "AB", "AB", "AB", "CN",
"AB"), Auth_Period_Month_Yr = c("2019-10", "2019-10", "2019-06",
"2019-05", "2019-09", "2019-08"), Rec_Period_Month_Yr = c("2019-09",
"2019-09", "2019-06", "2019-05", "2019-09", "2019-07"), TaT_Coll_to_Auth = structure(c(32400,
32400, 25738, 6730, 2564, 1558), class = "difftime", units = "days"),
M.Weighting = c(50L, 50L, 0L, 30L, NA, 0L)), row.names = c(NA,
6L), class = "data.frame")
From the nRequests_byYearMth formula I was expecting it to filter() to only show the AB entries, then group those by the Rec_Period_Month_Yr; when it was summarised I had it count the distinct entries (n_distinct()) and then the min()/max()/mean() would also show the data relating to the filtered results.
But when I've used Excel to look at the data extract I'm using, it doesn't seem to be filtering correctly.
I am thinking that I need to have the filter applied to the summarise() somehow.
Edit with outputs:
The resulting output is:
structure(list(Rec_Period_Month_Yr = c("2019-04", "2019-05",
"2019-06", "2019-07", "2019-08", "2019-09", "2019-10", "2019-11",
"2019-12", "2020-01", "2020-02", "2020-03"), Specimen.Number.Full = c(4881L,
4929L, 4902L, 5289L, 4815L, 5043L, 5697L, 5051L, 4552L, 5434L,
4917L, 4556L), `min(TaT_Coll_to_Auth)` = structure(c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), class = "difftime", units = "days"),
`max(TaT_Coll_to_Auth)` = structure(c(368, 6730, 25738, 1558,
222, 32400, 374, 150, 320, 97, 382, 60), class = "difftime", units = "days"),
`mean(TaT_Coll_to_Auth)` = structure(c(9.80235422940049,
10.768904109589, 14.8278848840458, 10.0686706074708, 10.2533425223983,
19.6828624240824, 11.8121527777778, 10.4033579583613, 10.4007004231723,
9.04840344652813, 8.94940393678958, 8.2197571578474), class = "difftime", units = "days")), row.names = c(NA,
-12L), class = c("tbl_df", "tbl", "data.frame"))
The expected output I want is below. But I can only create this if I only look at the AB entries, and summarise() doesn't seem to do that for the min()/max()/mean() — instead it looks at the entire entries for the column.
I need it to look at only the entries relating to the AB filter() (for all the summarised items.)
(The last Max entry shows as 60 in R but if properly filtered would show as 50)
structure(list(Year.and.Mth = c("2019-4", "2019-5", "2019-6",
"2019-7", "2019-8", "2019-9", "2019-10", "2019-11", "2019-12",
"2020-1", "2020-2", "2020-3"), Number.Full = c(4881, 4929, 4902,
5289, 4815, 5043, 5697, 5051, 4552, 5434, 4917, 4556), Max = c(113,
6730, 25738, 1558, 156, 32400, 374, 109, 320, 97, 382, 50), Mean = c(7.97705388240115,
9.34286873605194, 13.514891880865, 8.39194554736245, 7.72294911734164,
15.2502478683323, 9.15850447604002, 8.85389031874876, 9.00021968365554,
7.76573426573427, 7.97335773845841, 7.350526778)), class = "data.frame", row.names = c(NA,
-12L))

How to create a for loop based on unique user IDs and specific event types

I have two data frames: users and events.
Both data frames contain a field that links events to users.
How can I create a for loop where every user's unique ID is matched against an event of a particular type and then stores the number of occurrences into a new column within users (users$conversation_started, users$conversation_missed, etc.)?
In short, it is a conditional for loop.
So far I have this but it is wrong:
# NOTE(review): acknowledged-broken attempt, kept for reference. Problems:
# `=` inside `[` should be the comparison operator `==`; the subset never
# restricts to user i, so every iteration assigns the same overall count to
# the whole column; and the data frame is named `events` elsewhere, not
# `event`.
for(i in users$id){
users$conversation_started <- nrow(event[event$type = "conversation-started"])
}
An example of how to do this would be ideal.
The idea is:
for(each user)
find the matching user ID in events
count the number of event types == "conversation-started"
assign count value to user$conversation_started
end for
Important note:
The type field can contain one of five values so I will need to be able to effectively filter on each type for each associate:
> events$type %>% table %>% as.matrix
[,1]
conversation-accepted 3120
conversation-already-accepted 19673
conversation-declined 27
conversation-missed 831
conversation-request 23427
Data frames (note that these are reduced versions as confidential information has been removed):
users <- structure(list(`_id` = c("JTuXhdI4Ai", "iGIeCEXyVE", "6XFtOJh0bD",
"mNN986oQv9", "9NI71KBMX9", "x1jH7t0Cmy"), language = c("en",
"en", "en", "en", "en", "en"), registering = c(TRUE, TRUE, FALSE,
FALSE, FALSE, NA), `_created_at` = structure(c(1485995043.131,
1488898839.838, 1480461193.146, 1481407887.979, 1489942757.189,
1491311381.916), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
`_updated_at` = structure(c(1521039527.236, 1488898864.834,
1527618624.877, 1481407959.116, 1490043838.561, 1491320333.09
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), lastOnlineTimestamp = c(1521039526.90314,
NA, 1480461472, 1481407959, 1490043838, NA), isAgent = c(FALSE,
NA, FALSE, FALSE, FALSE, NA), lastAvailableTime = structure(c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), class = c("POSIXct",
"POSIXt"), tzone = ""), available = c(NA, NA, NA, NA, NA,
NA), busy = c(NA, NA, NA, NA, NA, NA), joinedTeam = structure(c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), class = c("POSIXct",
"POSIXt"), tzone = ""), timezone = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
)), row.names = c("list.1", "list.2", "list.3", "list.4",
"list.5", "list.6"), class = "data.frame")
and
events <- structure(list(`_id` = c("JKY8ZwkM1S", "CG7Xj8dAsA", "pUkFFxoahy",
"yJVJ34rUCl", "XxXelkIFh7", "GCOsENVSz6"), expirationTime = structure(c(1527261147.873,
NA, 1527262121.332, NA, 1527263411.619, 1527263411.619), class = c("POSIXct",
"POSIXt"), tzone = ""), partId = c("d22bfddc-cd51-489f-aec8-5ab9225c0dd5",
"d22bfddc-cd51-489f-aec8-5ab9225c0dd5", "cf4356da-b63e-4e4d-8e7b-fb63035801d8",
"cf4356da-b63e-4e4d-8e7b-fb63035801d8", "a720185e-c300-47c0-b30d-64e1f272d482",
"a720185e-c300-47c0-b30d-64e1f272d482"), type = c("conversation-request",
"conversation-accepted", "conversation-request", "conversation-accepted",
"conversation-request", "conversation-request"), `_p_conversation` = c("Conversation$6nSaLeWqs7",
"Conversation$6nSaLeWqs7", "Conversation$6nSaLeWqs7", "Conversation$6nSaLeWqs7",
"Conversation$bDuAYSZgen", "Conversation$bDuAYSZgen"), `_p_merchant` = c("Merchant$0A2UYADe5x",
"Merchant$0A2UYADe5x", "Merchant$0A2UYADe5x", "Merchant$0A2UYADe5x",
"Merchant$0A2UYADe5x", "Merchant$0A2UYADe5x"), `_p_associate` = c("D9ihQOWrXC",
"D9ihQOWrXC", "D9ihQOWrXC", "D9ihQOWrXC", "D9ihQOWrXC", "D9ihQOWrXC"
), `_wperm` = list(list(), list(), list(), list(), list(), list()),
`_rperm` = list("*", "*", "*", "*", "*", "*"), `_created_at` = structure(c(1527264657.998,
1527264662.043, 1527265661.846, 1527265669.435, 1527266922.056,
1527266922.059), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
`_updated_at` = structure(c(1527264657.998, 1527264662.043,
1527265661.846, 1527265669.435, 1527266922.056, 1527266922.059
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), read = c(TRUE,
NA, TRUE, NA, NA, NA), data.customerName = c("Shopper 109339",
NA, "Shopper 109339", NA, "Shopper 109364", "Shopper 109364"
), data.departmentName = c("Personal advisors", NA, "Personal advisors",
NA, "Personal advisors", "Personal advisors"), data.recurring = c(FALSE,
NA, TRUE, NA, FALSE, FALSE), data.new = c(TRUE, NA, FALSE,
NA, TRUE, TRUE), data.missed = c(0L, NA, 0L, NA, 0L, 0L),
data.customerId = c("84uOFRLmLd", "84uOFRLmLd", "84uOFRLmLd",
"84uOFRLmLd", "5Dw4iax3Tj", "5Dw4iax3Tj"), data.claimingTime = c(NA,
4L, NA, 7L, NA, NA), data.lead = c(NA, NA, FALSE, NA, NA,
NA), data.maxMissed = c(NA, NA, NA, NA, NA, NA), data.associateName = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), data.maxDecline = c(NA, NA, NA, NA, NA, NA
), data.goUnavailable = c(NA, NA, NA, NA, NA, NA)), row.names = c("list.1",
"list.2", "list.3", "list.4", "list.5", "list.6"), class = "data.frame")
Update: 21st September 2018
This solution now results in an NA-only data frame being produced at the end of the function. When written to a .csv, this is what I get (naturally, Excel displays NA-values as blank values):
My data source has not changed, nor has my script.
What might be causing this?
My guess is that this is an unforeseen case where there may have been 0 hits for each step has occurred; as such, is there a way to add 0 to those cases where there weren't any hits, rather than NA/ blank values?
Is there a way to avoid this?
New solution based on the provided data.
Note: As your data had no overlap in _id, I changed the events$_id to be the same as in users.
Simplified example data:
users <- structure(list(`_id` = structure(c(4L, 3L, 1L, 5L, 2L, 6L),
.Label = c("6XFtOJh0bD", "9NI71KBMX9", "iGIeCEXyVE",
"JTuXhdI4Ai", "mNN986oQv9", "x1jH7t0Cmy"),
class = "factor")), .Names = "_id",
row.names = c(NA, -6L), class = "data.frame")
events <- structure(list(`_id` = c("JKY8ZwkM1S", "CG7Xj8dAsA", "pUkFFxoahy",
"yJVJ34rUCl", "XxXelkIFh7", "GCOsENVSz6"),
type = c("conversation-request", "conversation-accepted",
"conversation-request", "conversation-accepted",
"conversation-request", "conversation-request")),
.Names = c("_id", "type"), class = "data.frame",
row.names = c("list.1", "list.2", "list.3", "list.4", "list.5", "list.6"))
events$`_id` <- users$`_id`
> users
_id
1 JTuXhdI4Ai
2 iGIeCEXyVE
3 6XFtOJh0bD
4 mNN986oQv9
5 9NI71KBMX9
6 x1jH7t0Cmy
> events
_id type
list.1 JTuXhdI4Ai conversation-request
list.2 iGIeCEXyVE conversation-accepted
list.3 6XFtOJh0bD conversation-request
list.4 mNN986oQv9 conversation-accepted
list.5 9NI71KBMX9 conversation-request
list.6 x1jH7t0Cmy conversation-request
We can use the same approach I suggested before, just enhance it a bit.
First we loop over unique(events$type) to store a table() of every type of event per id in a list:
# One contingency table per event type: rows are the event `_id` values,
# columns are FALSE/TRUE for "is this row of that type"; column 2 therefore
# holds the per-id count of that event type.
test <- lapply(unique(events$type), function(x) table(events$`_id`, events$type == x))
Then we store the specific type as the name of the respective table in the list:
names(test) <- unique(events$type)
Now we use a simple for-loop to match() the user$_id with the rownames of the table and store the information in a new variable with the name of the event type:
# For every event type, look up each user's count in the corresponding
# table. match() returns NA for users that have no row in that table (no
# events of this type at all), which is exactly the all-NA/blank-cell
# symptom described above — so replace those NAs with 0 before storing.
for (i in names(test)) {
  counts <- test[[i]][, 2][match(users$`_id`, rownames(test[[i]]))]
  counts[is.na(counts)] <- 0
  users[, i] <- counts
}
Result:
> users
_id conversation-request conversation-accepted
1 JTuXhdI4Ai 1 0
2 iGIeCEXyVE 0 1
3 6XFtOJh0bD 1 0
4 mNN986oQv9 0 1
5 9NI71KBMX9 1 0
6 x1jH7t0Cmy 1 0
Hope this helps!

Extract specific columns from dataset, create column of NAs if it doesn't exist

Data frame df has 57 columns. I later read in other csv files, each of which may have the same 57, but more likely have more or fewer columns. I take the names of the original file as:
df = read.csv(...)
str = colnames(df)
I know I can take subsets of a data frame as:
file = read.csv(...)
file = file[, str]
If the columns of file have the same or greater number of columns than the original 57, this will work fine. The extra columns would simply be dropped. However, if the columns of file are fewer than the original 57, the following error arises:
Error in `[.data.frame`(file, , str) : undefined columns selected
Is there a way to take this same approach, but create columns of NA if the column does not exist in file?
EDIT: Including dput output for @akrun. I'm not familiar with dput so I hope this is what you were asking for:
File 1 example:
`structure(list(ObservationURI = c("http://resources.usgin.org/uri-gin/wygs/bhtemp/49-037-20341_182_12296/",
"http://resources.usgin.org/uri-gin/wygs/bhtemp/49-037-20341_215_14316/",
"http://resources.usgin.org/uri-gin/wygs/bhtemp/49-037-20341_236_16496/"
), WellName = c("1 BRADY UNIT ANADARKO E&P COMPANY LP", "1 BRADY UNIT ANADARKO E&P COMPANY LP",
"1 BRADY UNIT ANADARKO E&P COMPANY LP"), APINo = c("49-037-20341",
"49-037-20341", "49-037-20341"), HeaderURI = c("http://resources.usgin.org/uri-gin/wygs/well/3720341/",
"http://resources.usgin.org/uri-gin/wygs/well/3720341/", "http://resources.usgin.org/uri-gin/wygs/well/3720341/"
), OtherID = c(3720341, 3720341, 3720341), OtherName = c(NA,
NA, NA), BoreholeName = c(NA, NA, NA), Label = c("Temperature observation for well 3720341",
"Temperature observation for well 3720341", "Temperature observation for well 3720341"
), Operator = c("", "", ""), LeaseName = c("", "", ""), LeaseOwner = c("",
"", ""), LeaseNo = c("", "", ""), SpudDate = c("1900-01-01T00:00",
"1900-01-01T00:00", "1900-01-01T00:00"), EndedDrillingDate = c("",
"", ""), WellType = c("Oil", "Oil", "Oil"), Status = c("Producing Oil Well",
"Producing Oil Well", "Producing Oil Well"), CommodityOfInterest = c("",
"", ""), StatusDate = c("1973-05-03T00:00:00", "1973-05-03T00:00:00",
"1973-05-03T00:00:00"), Function = c(NA, NA, NA), Production = c(NA,
NA, NA), ProducingInterval = c(NA, NA, NA), ReleaseDate = c(NA,
NA, NA), Field = c("", "", ""), OtherLocationName = c("Great Divide Basin",
"Great Divide Basin", "Great Divide Basin"), County = c("Sweetwater",
"Sweetwater", "Sweetwater"), State = c("WY", "WY", "WY"), PLSS_Meridians = c(NA,
NA, NA), TWP = c("16N", "16N", "16N"), RGE = c("101W", "101W",
"101W"), Section_ = c(11, 11, 11), SectionPart = c("NENW", "NENW",
"NENW"), Parcel = c(NA, NA, NA), UTM_E = c(NA, NA, NA), UTM_N = c(NA,
NA, NA), UTMDatumZone = c(NA, NA, NA), LatDegree = c(41.38696,
41.38696, 41.38696), LongDegree = c(-108.75009, -108.75009, -108.75009
), SRS = c("EPSG:4326", "EPSG:4326", "EPSG:4326"), LocationUncertaintyStatement = c("nil:missing",
"nil:missing", "nil:missing"), LocationUncertaintyCode = c(NA,
NA, NA), LocationUncertaintyRadius = c(NA, NA, NA), DrillerTotalDepth = c(NA_real_,
NA_real_, NA_real_), DepthReferencePoint = c(NA, NA, NA), LengthUnits = c("ft",
"ft", "ft"), WellBoreShape = c(NA, NA, NA), TrueVerticalDepth = c(NA,
NA, NA), ElevationKB = c(7135, 7135, 7135), ElevationDF = c(7106,
7106, 7106), ElevationGL = c(0, 0, 0), FormationTD = c("", "",
""), BitDiameterCollar = c(NA, NA, NA), BitDiameterTD = c(NA_real_,
NA_real_, NA_real_), DiameterUnits = c("", "", ""), Notes = c("Depth of measurement assumed to be equal to driller total depth (CRC-AZGS, 2013).",
"Depth of measurement assumed to be equal to driller total depth (CRC-AZGS, 2013).",
"Depth of measurement assumed to be equal to driller total depth (CRC-AZGS, 2013)."
), MaximumRecordedTemperature = c(NA_real_, NA_real_, NA_real_
), MeasuredTemperature = c(182, 215, 236), CorrectedTemperature = c(NA_real_,
NA_real_, NA_real_), TemperatureUnits = c(FALSE, FALSE, FALSE
), TimeSinceCirculation = c(NA_real_, NA_real_, NA_real_), CirculationDuration = c(11,
12, 12), MeasurementProcedure = c("Well log", "Well log", "Well log"
), CorrectionType = c(NA, NA, NA), DepthOfMeasurement = c(-99999,
-99999, -99999), MeasurementDateTime = c("", "", ""), MeasurementFormation = c("",
"", ""), MeasurementSource = c("Richard W. Davis: Deriving geothermal parameters from bottom-hole temperatures in Wyoming\" AAPG bulletin, V. 96, No. 8 (August 2012), pp. 1579-1592",
"Richard W. Davis: Deriving geothermal parameters from bottom-hole temperatures in Wyoming\" AAPG bulletin, V. 96, No. 8 (August 2012), pp. 1579-1592",
"Richard W. Davis: Deriving geothermal parameters from bottom-hole temperatures in Wyoming\" AAPG bulletin, V. 96, No. 8 (August 2012), pp. 1579-1592"
), RelatedResource = c(NA, NA, NA), CasingLogger = c(NA, NA,
NA), CasingBottomDepthDriller = c(NA, NA, NA), CasingTopDepth = c(NA_real_,
NA_real_, NA_real_), CasingPipeDiameter = c(NA, NA, NA), CasingWeight = c(NA,
NA, NA), CasingWeightUnits = c(NA, NA, NA), CasingThickness = c(NA,
NA, NA), DrillingFluid = c("", "", ""), Salinity = c(NA_real_,
NA_real_, NA_real_), MudResistivity = c(NA_real_, NA_real_, NA_real_
), Density = c(NA_real_, NA_real_, NA_real_), FluidLevel = c(NA_real_,
NA_real_, NA_real_), pH = c(NA_real_, NA_real_, NA_real_), Viscosity = c(NA_real_,
NA_real_, NA_real_), FluidLoss = c(NA_real_, NA_real_, NA_real_
), MeasurementNotes = c(NA, NA, NA), InformationSource = c("Wyoming State Geological Survey",
"Wyoming State Geological Survey", "Wyoming State Geological Survey"
)), .Names = c("ObservationURI", "WellName", "APINo", "HeaderURI",
"OtherID", "OtherName", "BoreholeName", "Label", "Operator",
"LeaseName", "LeaseOwner", "LeaseNo", "SpudDate", "EndedDrillingDate",
"WellType", "Status", "CommodityOfInterest", "StatusDate", "Function",
"Production", "ProducingInterval", "ReleaseDate", "Field", "OtherLocationName",
"County", "State", "PLSS_Meridians", "TWP", "RGE", "Section_",
"SectionPart", "Parcel", "UTM_E", "UTM_N", "UTMDatumZone", "LatDegree",
"LongDegree", "SRS", "LocationUncertaintyStatement", "LocationUncertaintyCode",
"LocationUncertaintyRadius", "DrillerTotalDepth", "DepthReferencePoint",
"LengthUnits", "WellBoreShape", "TrueVerticalDepth", "ElevationKB",
"ElevationDF", "ElevationGL", "FormationTD", "BitDiameterCollar",
"BitDiameterTD", "DiameterUnits", "Notes", "MaximumRecordedTemperature",
"MeasuredTemperature", "CorrectedTemperature", "TemperatureUnits",
"TimeSinceCirculation", "CirculationDuration", "MeasurementProcedure",
"CorrectionType", "DepthOfMeasurement", "MeasurementDateTime",
"MeasurementFormation", "MeasurementSource", "RelatedResource",
"CasingLogger", "CasingBottomDepthDriller", "CasingTopDepth",
"CasingPipeDiameter", "CasingWeight", "CasingWeightUnits", "CasingThickness",
"DrillingFluid", "Salinity", "MudResistivity", "Density", "FluidLevel",
"pH", "Viscosity", "FluidLoss", "MeasurementNotes", "InformationSource"
), row.names = c(NA, 3L), class = "data.frame")`
File 2 example:
`structure(list(ObservationURI = c("http://resources.usgin.org/uri-gin/mags/bhtemp/UM:MA-Weston47-422036N0711640.1/",
"http://resources.usgin.org/uri-gin/mags/bhtemp/UM:MA-Dover20-421431N0711752.1/",
"http://resources.usgin.org/uri-gin/mags/bhtemp/UM:MA-Lincoln13-422440N0711815.1/"
), WellName = c("Weston47-USGS HDR19", "Dover20-USGS HDR19",
"Lincoln13-USGS HDR19"), APINo = c(NA, NA, NA), HeaderURI = c("http://resources.usgin.org/uri-gin/mags/well/Weston47-USGS_HDR19/",
"http://resources.usgin.org/uri-gin/mags/well/Dover20-USGS_HDR19/",
"http://resources.usgin.org/uri-gin/mags/well/Lincoln13-USGS_HDR19/"
), OtherID = c("", "", ""), OtherName = c("", "", ""), BoreholeName = c(NA,
NA, NA), Operator = c(NA, NA, NA), LeaseOwner = c(NA, NA, NA),
LeaseNo = c(NA, NA, NA), SpudDate = c(NA, NA, NA), EndedDrillingDate = c("",
"", ""), WellType = c("temporarily abandoned", "observation",
"observation"), Status = c("Idle", "Idle", "Idle"), CommodityOfInterest = c("Water",
"Water", "Water"), StatusDate = c("", "", ""), Function = c("production",
"monitoring", "monitoring"), Production = c(NA, NA, NA),
Field = c(NA, NA, NA), County = c("Middlesex", "Norfolk",
"Middlesex"), State = c("MA", "MA", "MA"), PLSS_Meridians = c(NA,
NA, NA), TWP = c(NA, NA, NA), RGE = c(NA, NA, NA), Section_ = c(NA,
NA, NA), SectionPart = c(NA, NA, NA), Parcel = c(NA, NA,
NA), UTM_E = c(NA, NA, NA), UTM_N = c(NA, NA, NA), LatDegree = c(42.3147771183,
42.2417748607, 42.4110851252), LongDegree = c(-71.3257301787,
-71.2975422044, -71.3034583949), SRS = c("EPSG:4326", "EPSG:4326",
"EPSG:4326"), LocationUncertaintyStatement = c("Field located on topographic map",
"Field located on topographic map", "Field located on topographic map"
), DrillerTotalDepth = c(29, 22, 20), LengthUnits = c("ft",
"ft", "ft"), WellBoreShape = c("Vertical", "Vertical", "Vertical"
), TrueVerticalDepth = c(NA, NA, NA), ElevationGL = c(140,
150, 180), BitDiameterTD = c(72, 48, 42), DiameterUnits = c("in",
"in", "in"), Notes = c("", "", ""), MeasuredTemperature = c(8,
9, 8.5), CorrectedTemperature = c(NA, NA, NA), TemperatureUnits = c("C",
"C", "C"), TimeSinceCirculation = c(NA, NA, NA), CirculationDuration = c(NA,
NA, NA), MeasurementProcedure = c("Samples collected from spigot or faucet nearest to well. Water run until temperature, pH or specific conductance stablized. Temperature measured with a mercury thermometer to nearest half degree in degrees F. Converted to degrees C for table.",
"Samples collected from spigot or faucet nearest to well. Water run until temperature, pH or specific conductance stablized. Temperature measured with a mercury thermometer to nearest half degree in degrees F. Converted to degrees C for table.",
"Samples collected from spigot or faucet nearest to well. Water run until temperature, pH or specific conductance stablized. Temperature measured with a mercury thermometer to nearest half degree in degrees F. Converted to degrees C for table."
), CorrectionType = c(NA, NA, NA), DepthOfMeasurement = c(NA,
NA, NA), MeasurementDateTime = c(NA, NA, NA), MeasurementFormation = c(NA,
NA, NA), MeasurementSource = c("Walker, Eugene H., William W. Caswell, and S. William Wandle, Jr. Hydrologic Data of the Charles River Basin",
"Walker, Eugene H., William W. Caswell, and S. William Wandle, Jr. Hydrologic Data of the Charles River Basin",
"Walker, Eugene H., William W. Caswell, and S. William Wandle, Jr. Hydrologic Data of the Charles River Basin"
), CasingLogger = c(" Massachusetts\". USGS Massachusetts Hydrologic-Data Report No. 19 (1977): 1-57. Print. ftp://eclogite.geo.umass.edu/pub/stategeologist/Products/Geothermal/BoreholeTemperatureData/DataReport19.pdf\"",
" Massachusetts\". USGS Massachusetts Hydrologic-Data Report No. 19 (1977): 1-57. Print. ftp://eclogite.geo.umass.edu/pub/stategeologist/Products/Geothermal/BoreholeTemperatureData/DataReport19.pdf\"",
" Massachusetts\". USGS Massachusetts Hydrologic-Data Report No. 19 (1977): 1-57. Print. ftp://eclogite.geo.umass.edu/pub/stategeologist/Products/Geothermal/BoreholeTemperatureData/DataReport19.pdf\""
), CasingDepthDriller = c("", "", ""), CasingPipeDiameter = c("",
"", ""), CasingWeight = c(NA, NA, NA), CasingWeightUnits = c(NA,
NA, NA), CasingThickness = c(NA, NA, NA), DrillingFluid = c(NA,
NA, NA), Salinity = c(NA, NA, NA), MudResisitivity = c(NA,
NA, NA), Density = c(NA, NA, NA), FluidLevel = c(NA, NA,
NA), pH = c(NA, NA, NA), Viscosity = c(NA, NA, NA), FluidLoss = c(NA,
NA, NA), Unnamed..66 = c(NA, NA, NA), BitDiameterCollar = c(72,
48, 42), Unnamed..68 = c(NA, NA, NA), InformationSource = c("Stephen Mabee, MA State Geologist, University of Massachusetts, 611 North Pleasant Street, Amherst MA 01003 413-545-2285",
"Stephen Mabee, MA State Geologist, University of Massachusetts, 611 North Pleasant Street, Amherst MA 01003 413-545-2285",
"Stephen Mabee, MA State Geologist, University of Massachusetts, 611 North Pleasant Street, Amherst MA 01003 413-545-2285"
)), .Names = c("ObservationURI", "WellName", "APINo", "HeaderURI",
"OtherID", "OtherName", "BoreholeName", "Operator", "LeaseOwner",
"LeaseNo", "SpudDate", "EndedDrillingDate", "WellType", "Status",
"CommodityOfInterest", "StatusDate", "Function", "Production",
"Field", "County", "State", "PLSS_Meridians", "TWP", "RGE", "Section_",
"SectionPart", "Parcel", "UTM_E", "UTM_N", "LatDegree", "LongDegree",
"SRS", "LocationUncertaintyStatement", "DrillerTotalDepth", "LengthUnits",
"WellBoreShape", "TrueVerticalDepth", "ElevationGL", "BitDiameterTD",
"DiameterUnits", "Notes", "MeasuredTemperature", "CorrectedTemperature",
"TemperatureUnits", "TimeSinceCirculation", "CirculationDuration",
"MeasurementProcedure", "CorrectionType", "DepthOfMeasurement",
"MeasurementDateTime", "MeasurementFormation", "MeasurementSource",
"CasingLogger", "CasingDepthDriller", "CasingPipeDiameter", "CasingWeight",
"CasingWeightUnits", "CasingThickness", "DrillingFluid", "Salinity",
"MudResisitivity", "Density", "FluidLevel", "pH", "Viscosity",
"FluidLoss", "Unnamed..66", "BitDiameterCollar", "Unnamed..68",
"InformationSource"), row.names = c(NA, 3L), class = "data.frame")`
We can read the datasets in a list with fread and use rbindlist from data.table with fill = TRUE and idcol argument to create a single data.table object. The fill = TRUE ensure that NA elements are created for those datasets that have lesser number of columns.
library(data.table)
# Get the csv files from the working directory. Anchor the pattern: in a
# regular expression "." matches ANY character, so the original unanchored
# ".csv" would also pick up names like "results_csv.txt" or "data.csv.bak".
files <- list.files(pattern = "\\.csv$")
# Read every file with fread() and stack the resulting data.tables.
# fill = TRUE pads columns missing from a given file with NA, and
# idcol = "grp" records which file (by position in `files`) each row
# came from.
rbindlist(lapply(files, fread), fill = TRUE, idcol = "grp")

Improve code efficiency

I've been working on code that reads in all the sheets of an Excel workbook, where the first two columns in each sheet are "Date" and "Time", and the next two columns are either "Level" and "Temperature", or "LEVEL" and "TEMPERATURE". The code works, but I am working on improving my coding clarity and efficiency, so any advice in those regards would be greatly appreciated.
My function 1) reads in the data to a list of dataframes, 2) gets rid of any NA columns that were accidentally read in, 3) combines "Date" and "Time" to "DateTime" for each dataframe, 4) rounds "DateTime" to the nearest 5 minutes for each dataframe, 5) replaces "Date" and "Time" in each dataframe with "DateTime". I started getting more comfortable with lapply, but am wondering if I can improve the code efficiency at all instead of have so many lines with lapply.
library(readxl)
library(plyr)
read_excel_allsheets <- function(filename) {
  # Read every sheet of an Excel workbook into a named list of data frames,
  # replacing the first two columns ("Date", "Time") with a single "DateTime"
  # column rounded to the nearest 5 minutes.
  #
  # filename: path to an .xls/.xlsx file.
  # Returns: a named list (one element per sheet) of data frames.
  sheets <- readxl::excel_sheets(filename)
  process_sheet <- function(sheet) {
    dat <- readxl::read_excel(filename, sheet = sheet)
    # Drop every column that contains an NA — this removes the stray
    # all-NA columns that read_excel sometimes picks up past the real data.
    # NOTE(review): this also drops a genuine data column that has even one
    # NA; use colSums(is.na(dat)) < nrow(dat) if only all-NA columns should go.
    dat <- dat[, colSums(is.na(dat)) == 0]
    # Combine the date part of column 1 with the clock time of column 2,
    # parse as POSIXct, then round to the nearest 5 minutes (300 seconds).
    stamp <- as.POSIXct(
      paste(dat[[1]], format(dat[[2]], format = "%H:%M")),
      format = "%Y-%m-%d %H:%M"
    )
    # Keep the column as POSIXct: POSIXlt (a list of components) is not a
    # suitable data-frame column type.
    dat$DateTime <- as.POSIXct(
      round(as.numeric(stamp) / 300) * 300,
      origin = "1970-01-01"
    )
    # Replace the original Date/Time columns with the combined stamp.
    dat[!(names(dat) %in% c("Date", "Time"))]
  }
  # One pass over the sheets instead of six parallel lapply()/Map() chains.
  sheet_data <- lapply(sheets, process_sheet)
  names(sheet_data) <- sheets
  sheet_data
}
Next, I would like to plot all of my data. So, I 1) run my code for a file, 2) combine the list of dataframes into one dataframe while maintaining an "ID" for each dataframe as a column, 3) combine the lowercase and uppercase versions of the variable columns, 4) add two new columns that split the "ID". Each ID is something like B1CC or B2CO, where I want to split the "ID" like so: "B1" and "CC". Now I can use ggplot very easily.
mysheets <- read_excel_allsheets(filename)
# Stack the list of data frames into one; ldply stores each sheet's name
# in the ".id" column.
df <- ldply(mysheets)
# Merge the lower/upper-case variants of each measurement column. Each row
# is assumed to have a value in exactly one of the pair, so rowSums with
# na.rm collapses them. NOTE(review): a row that is NA in BOTH variants
# becomes 0 rather than NA — confirm that cannot occur in the data.
df$Temp <- rowSums(df[, c("Temperature", "TEMPERATURE")], na.rm = TRUE)
df$Lev <- rowSums(df[, c("Level", "LEVEL")], na.rm = TRUE)
df <- df[!names(df) %in% c("Level", "LEVEL", "Temperature", "TEMPERATURE")]
# Split IDs like "B1CC": strip the leading two characters to get "CC" (exp)
# and the trailing two to get "B1" (plot). The original replacement "\\1"
# referenced a capture group the patterns never define; delete the match
# with "" instead.
df$exp <- gsub("^[[:alnum:]]{2}", "", df$.id)
df$plot <- gsub("[[:alnum:]]{2}$", "", df$.id)
Here are the data for the first two dataframes, but there are over 50 of them, and each is relatively big, and there are many files to read. Therefore, I'm looking to improve efficiency (in terms of time to run) where I can. Any help or advice is greatly appreciated!
dput(head(x[[1]]))
structure(list(Date = structure(c(1305504000, 1305504000, 1305504000,
1305504000, 1305504000, 1305504000), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), Time = structure(c(-2209121912, -2209121612,
-2209121312, -2209121012, -2209120712, -2209120412), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), Level = c(106.9038, 106.9059, 106.89,
106.9121, 106.8522, 106.8813), Temperature = c(6.176, 6.173,
6.172, 6.168, 6.166, 6.165)), .Names = c("Date", "Time", "Level",
"Temperature"), row.names = c(NA, 6L), class = c("tbl_df", "tbl",
"data.frame"))
dput(head(x[[2]]))
structure(list(Date = structure(c(1305504000, 1305504000, 1305504000,
1305504000, 1305504000, 1305504000), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), Time = structure(c(-2209121988, -2209121688,
-2209121388, -2209121088, -2209120788, -2209120488), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), LEVEL = c(117.5149, 117.511, 117.5031,
117.5272, 117.4523, 117.4524), TEMPERATURE = c(5.661, 5.651,
5.645, 5.644, 5.644, 5.645), `NA` = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), `NA` = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), `NA` = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), `NA` = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), `NA` = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_)), .Names = c("Date", "Time", "LEVEL",
"TEMPERATURE", NA, NA, NA, NA, NA), row.names = c(NA, 6L), class =
c("tbl_df", "tbl", "data.frame"))

Resources