I have some data from event producer. In a "created_at column I have mixed type of datetime value.
Some NA, some ISO8601 like, some POSIX with and without millisec.
I build a func that should take care of everything meanning let's NA and ISO8601 info as it is, and convert POSIX date to ISO8601.
library(anytime)
convert_time <- function(x) {
nb_char = nchar(x)
if (is.na(x)) return(x)
else if (nb_char == 10 | nb_char == 13) {
num_x = as.numeric(x)
if (nb_char == 13) {
num_x = round(num_x / 1000, 0)
}
return(anytime(num_x))
}
return(x)
}
If I passe one problematic value
convert_time("1613488656")
"2021-02-16 15:17:36 UTC"
Works well !
Now
df_offer2$created_at = df_offer2$created_at %>% sapply(convert_time)
I still have the problematic values.
Any tips here ?
I would suggest the following small changes...
convert_time <- function(x) {
nb_char = nchar(x)
if (is.na(x)) return(x)
else if (nb_char == 10 | nb_char == 13) {
num_x = as.numeric(x)
if (nb_char == 13) {
num_x = round(num_x / 1000, 0)
}
return(num_x) #remove anytime from here
}
return(x)
}
df_offer2$created_at = df_offer2$created_at %>%
sapply(convert_time) %>% anytime() #put it back in at this point
Two things that have worked for me:
col1<-seq(from=1,to=10)
col2<-rep("1613488656",10)
df <- data.frame(cbind(col1,col2))
colnames(df)<-c("index","created_at")
df <- df%>%
mutate(converted = convert_time(df$created_at))`
alternatively
col1<-seq(from=1,to=10)
col2<-rep("1613488656",10)
df <- data.frame(cbind(col1,col2))
colnames(df)<-c("index","created_at")
df$created_at <- convert_time(df$created_at)
Both spit out warnings but appear to make the correction properly
Related
I want to create a function that takes a character value x. If the value starts with the letter "A", the function should return "Apple", and so forth.
test_df <- function(x){
if (input 'a75'){ "apple75" }
else if (input "d21"){ "dragonfruit21" }
}
Please check this below code
code
library(stringr)
test_df <- function(x){
if (str_detect(x,'^a')) {
return('apple75')
} else if (str_detect(x,'^d')) {
return('dragonfruit75')
} else {
return('no input match')
}
}
test_df('a75')
output
[1] "apple75"
You can try something like this:
test_df <- function(x){
x <- unlist(strsplit(x, ""))
string <- if(x[1] == "a"){
paste0("apple", paste0(x[-1], collapse = ""))
} else{
if(x[1] == "d") {
paste0("dragonfruit", paste0(x[-1], collapse = ""))
}
}
return(string)
}
> test_df('a75')
[1] "apple75"
> test_df('d21')
[1] "dragonfruit21"
You can even use Vectorize and give a vector as input:
test_df_v <- Vectorize(test_df)
test_df_v(c('a75','d21', 'aZ45'))
a75 d21 aZ45
"apple75" "dragonfruit21" "appleZ45"
Now you have a starting point, anything else you want o add, then would be easier from now on.
test_fun <- function(x){
x <- tolower(x) # Ensure the input is all lowercase
if(startsWith(x, 'a')) "apple75"
else if (startsWith(x, 'd')) "dragonfruit21"
}
test_fun('A')
[1] "apple75"
One way that might be easier to keep track of the options would be to create a lookup table within the function and select from that:
startsWith <- function(x){
firstletter = substr(tolower(x), 0,1)
lookup = data.frame(firstletter = c("a", "b", "c", "d"),
associated = c("apple75", "Banana69", "Carrot18", "dragonfruit21")
)
return(lookup$associated[which(lookup$firstletter==firstletter)])
}
startsWith("apple")
#[1] "apple75"
startsWith("drnking")
#[1] "dragonfruit21"
I'm trying to quickly replace multiple characters in a string with another character such as *
For example, I have a string such as:
string = "abcdefghij"
I also have a vector of indexes that indicate where I would like to replace letters in the above string with another character.
string_indexes_replaced = c(1, 4, 6, 9)
Desired output:
"*bc*e*gh*j"
What I've done
I've tried a very novice like approach of splitting the characters up into a list, replacing the characters with *, then collapsing the list back into the desired string, as shown below:
library(dplyr)
library(stringi)
string%>%
strsplit(split = "")%>%
lapply(function(x) replace(x, string_indexes_replaced, rep("*", length(string_indexes_replaced))))%>%
lapply(stri_flatten)%>%
unlist(use.names = FALSE)
which outputs
"*bc*e*gh*j"
but it is clear that there should be something simpler and faster than what I've posted above. Is there anything simpler & quicker than what I've demonstrated here?
in base R, besides the method of substring() and for-loop shown by #akrun,, you can use utf8ToInt() and intToUtf8 to make it
v <- utf8ToInt(string)
v[string_indexes_replaced ] <- utf8ToInt("*")
res <- intToUtf8(v)
which gives
> res
[1] "*bc*e*gh*j"
We can use substring
v1 <- c(1, 4, 6, 9)
for(i in seq_along(v1)) substring(string, v1[i], v1[i]) <- "*"
#[1] "*bc*e*gh*j"
As we are using stringi, another option is
library(stringi)
stri_sub_all(string, from = v1, length = 1) <- "*"
string
#[1] "*bc*e*gh*j"
A simple recursive solution. The time efficiency should be same as iteration (for loop). The benefit is there is no side-effect (assignment of integer ks is localized), so that we can treat its whole computation as a functional abstract and feed it to other part of the bigger program which we are working on. It will help to modularize the code.
# multi-replace for character vector input with length greater than 1
multi_replace_v <- function(v, r, ks) {
ks <- as.integer(ks)
if (length(ks) == 0) {
v
} else if (length(ks) == 1) {
if (ks[[1]] > length(v) | ks[[1]] < 1) {
stop("Invalid parameter: ks=", as.character(ks[[1]]), ". Valid range: 1-", as.character(length(v)))
} else if (ks[[1]] == 1) {
c(r, v[-1])
} else if (ks[[1]] == length(v)) {
c(v[-length(v)], r)
} else {
c(v[1:(ks[[1]]-1)], r, v[(ks[[1]]+1):length(v)])
}
} else {
multi_replace_v(multi_replace_v(v, r, ks[[1]]), r, ks[-1])
}
}
# multi-replace for input of single string character vector
multi_replace_s <- function(s, r, ks) paste0(multi_replace_v(unlist(strsplit(s, '')), r, ks), collapse = '')
# multi-replace for both single string and long vector input
multi_replace <- function(v_or_s, r, ks) {
if (length(v_or_s) == 1) {
multi_replace_s(v_or_s, r, ks)
} else if (length(v_or_s) > 1) {
multi_replace_v(v_or_s, r, ks)
} else {
NULL
}
}
# Example
> multi_replace('abcdefghij', "*", c(1,4,6,9))
[1] "*bc*e*gh*j"
convertToClockTime <- function(file, lag = Latency) {
colnames(adamcorrectfile)[which(colnames(adamcorrectfile) == "X.1")] <- "Calculated.Run.Time"
adamcorrectfile$Calculated.Run.Time <- round(adamcorrectfile$Calculated.Run.Time, digits = 0)
adamcorrectfile$LPRS.Time <- as.POSIXct(adamcorrectfile$LPRS.Time, format = "%H:%M")
adamcorrectfile <- adamcorrectfile[order(adamcorrectfile$LPRS.Time),]
output <- colnames(adamcorrectfile)
for (j in unique(adamcorrectfile$Folder)) {
adamcorrectfile.Folder <- adamcorrectfile[which(adamcorrectfile$Folder == "print 1"),]
adamcorrectfile.Folder$start.time <- adamcorrectfile.Folder$LPRS.Time + lag
adamcorrectfile.Folder$end.time <- adamcorrectfile.Folder$start.time + adamcorrectfile.Folder$`Calculated.Run.Time`
for (i in 2:nrow(adamcorrectfile)) {
adamcorrectfile.Folder[i,"start.time"] <- max(adamcorrectfile.Folder[i,"LPRS.Time"] + 15*60, adamcorrectfile[i-1, "end.time"]
adamcorrectfile.Folder[i, "end.time"] <- adamcorrectfile.Folder[i,"start.time"] + adamcorrectfile.Folder[i,"Calculated.Run.Time"]
}
output <- rbind(output, adamcorrectfile.Folder)
return(output)
}
}
On line 1 it says unmatched opening bracket '{' even though it is matched. Any help. Are there some tricky things with R indentation, it just keeps popping errors due to indentations it seems
It looks like the problem is that you haven't closed the parentheses on the max function in the middle of the for loop.
Here is the working code:
convertToClockTime <- function(file, lag = Latency) {
colnames(adamcorrectfile)[which(colnames(adamcorrectfile) == "X.1")] <- "Calculated.Run.Time"
adamcorrectfile$Calculated.Run.Time <- round(adamcorrectfile$Calculated.Run.Time, digits = 0)
adamcorrectfile$LPRS.Time <- as.POSIXct(adamcorrectfile$LPRS.Time, format = "%H:%M")
adamcorrectfile <- adamcorrectfile[order(adamcorrectfile$LPRS.Time),]
output <- colnames(adamcorrectfile)
for (j in unique(adamcorrectfile$Folder)) {
adamcorrectfile.Folder <- adamcorrectfile[which(adamcorrectfile$Folder == "print 1"),]
adamcorrectfile.Folder$start.time <- adamcorrectfile.Folder$LPRS.Time + lag
adamcorrectfile.Folder$end.time <- adamcorrectfile.Folder$start.time + adamcorrectfile.Folder$`Calculated.Run.Time`
for (i in 2:nrow(adamcorrectfile)) {
adamcorrectfile.Folder[i,"start.time"] <- max(adamcorrectfile.Folder[i,"LPRS.Time"] + 15*60, adamcorrectfile[i-1, "end.time"])
adamcorrectfile.Folder[i, "end.time"] <- adamcorrectfile.Folder[i,"start.time"] + adamcorrectfile.Folder[i,"Calculated.Run.Time"]
}
output <- rbind(output, adamcorrectfile.Folder)
return(output)
}
}
To find the closest date, I have:
closestDate <- function(searchDate, dateList, roundDown=FALSE) {
as.Date(sapply(as.Date(searchDate), function(x){
dist <- abs(x - as.Date(dateList))
closest <- dateList[which(min(dist) == dist)]
return(ifelse(roundDown, min(closest), max(closest)))
}), origin="1970-1-1")
}
When:
> nonNAdays
[1] "2011-08-15" "2011-08-18" "2011-08-19"
I get:
> closestDate('2011-08-15', nonNAdays)
[1] "2011-08-15"
I would like for the function to give me the closest date other than the date itself. So in this case, "2011-08-18". How can I alter my code to get this?
Thanks.
Just remove the dates that are equal from the dist calculations and from the selection operation:
closestDate <- function(searchDate, dateList, roundDown=FALSE) {
as.Date(sapply(as.Date(searchDate), function(x){
dist <- abs(x - as.Date(dateList[dateList != searchDate]))
closest <- dateList[dateList != searchDate][which(min(dist) == dist)]
return(ifelse(roundDown, min(closest), max(closest)))
}), origin="1970-1-1")
}
nonNAdays <- c("2011-08-15", "2011-08-18", "2011-08-19")
closestDate('2011-08-15', nonNAdays)
#[1] "2011-08-18"
D <- "06.12.1948" # which is dd.mm.yyyy
as.Date(D, "%d.%m.%y") # convert to date
[1] "2019-12-06" # ????
what is it that I am missing?
Sys.getlocale(category = "LC_ALL")
[1] "LC_COLLATE=German_Austria.1252;LC_CTYPE=German_Austria.1252;LC_MONETARY=German_Austria.1252;LC_NUMERIC=C;LC_TIME=German_Austria.1252"
The format is case-sensitive ("%y" is ambiguous and system dependent, I believe):
as.Date(D, "%d.%m.%Y")
[1] "1948-12-06"
The help topic ?strptime has details:
‘%y’ Year without century (00-99). On input, values 00 to 68 are
prefixed by 20 and 69 to 99 by 19 - that is the behaviour
specified by the 2004 and 2008 POSIX standards, but they do
also say ‘it is expected that in a future version the default
century inferred from a 2-digit year will change’.
To avoid remembering formats of the date we can use packaged solutions.
1) With lubridate
lubridate::dmy(D)
#[1] "1948-12-06"
2) Using anytime
anytime::anydate(D)
#[1] "1948-06-12"
Might be helpful for someone. I have found this function in tutorial "Handling date-times in R" by Cole Beck. The function identifies format of your data.
# FUNCTION guessDateFormat #x vector of character dates/datetimes #returnDates return
# actual dates rather than format convert character datetime to POSIXlt datetime, or
# at least guess the format such that you could convert to datetime
guessDateFormat <- function(x, returnDates = FALSE, tzone = "") {
x1 <- x
# replace blanks with NA and remove
x1[x1 == ""] <- NA
x1 <- x1[!is.na(x1)]
if (length(x1) == 0)
return(NA)
# if it's already a time variable, set it to character
if ("POSIXt" %in% class(x1[1])) {
x1 <- as.character(x1)
}
dateTimes <- do.call(rbind, strsplit(x1, " "))
for (i in ncol(dateTimes)) {
dateTimes[dateTimes[, i] == "NA"] <- NA
}
# assume the time part can be found with a colon
timePart <- which(apply(dateTimes, MARGIN = 2, FUN = function(i) {
any(grepl(":", i))
}))
# everything not in the timePart should be in the datePart
datePart <- setdiff(seq(ncol(dateTimes)), timePart)
# should have 0 or 1 timeParts and exactly one dateParts
if (length(timePart) > 1 || length(datePart) != 1)
stop("cannot parse your time variable")
timeFormat <- NA
if (length(timePart)) {
# find maximum number of colons in the timePart column
ncolons <- max(nchar(gsub("[^:]", "", na.omit(dateTimes[, timePart]))))
if (ncolons == 1) {
timeFormat <- "%H:%M"
} else if (ncolons == 2) {
timeFormat <- "%H:%M:%S"
} else stop("timePart should have 1 or 2 colons")
}
# remove all non-numeric values
dates <- gsub("[^0-9]", "", na.omit(dateTimes[, datePart]))
# sep is any non-numeric value found, hopefully / or -
sep <- unique(na.omit(substr(gsub("[0-9]", "", dateTimes[, datePart]), 1, 1)))
if (length(sep) > 1)
stop("too many seperators in datePart")
# maximum number of characters found in the date part
dlen <- max(nchar(dates))
dateFormat <- NA
# when six, expect the century to be omitted
if (dlen == 6) {
if (sum(is.na(as.Date(dates, format = "%y%m%d"))) == 0) {
dateFormat <- paste("%y", "%m", "%d", sep = sep)
} else if (sum(is.na(as.Date(dates, format = "%m%d%y"))) == 0) {
dateFormat <- paste("%m", "%d", "%y", sep = sep)
} else stop("datePart format [six characters] is inconsistent")
}else if (dlen == 8) {
if (sum(is.na(as.Date(dates, format = "%Y%m%d"))) == 0) {
dateFormat <- paste("%Y", "%m", "%d", sep = sep)
} else if (sum(is.na(as.Date(dates, format = "%m%d%Y"))) == 0) {
dateFormat <- paste("%m", "%d", "%Y", sep = sep)
} else stop("datePart format [eight characters] is inconsistent")
} else {
stop(sprintf("datePart has unusual length: %s", dlen))
}
if (is.na(timeFormat)) {
format <- dateFormat
} else if (timePart == 1) {
format <- paste(timeFormat, dateFormat)
} else if (timePart == 2) {
format <- paste(dateFormat, timeFormat)
} else stop("cannot parse your time variable")
if (returnDates)
return(as.POSIXlt(x, format = format, tz = tzone))
format
}
# generate some dates
mydates <- format(as.POSIXct(sample(31536000, 20), origin = "2011-01-01", tz = "UTC"), "%m.%d.%Y %H:%M")
mydates
## [1] "02/07/2011 06:51" "11/21/2011 17:03" "09/17/2011 22:42" "02/16/2011 13:45"
## [5] "12/14/2011 19:11" "09/08/2011 09:22" "12/06/2011 14:06" "02/02/2011 11:00"
## [9] "03/27/2011 06:12" "01/05/2011 15:09" "04/15/2011 04:17" "10/20/2011 14:20"
## [13] "11/13/2011 21:46" "02/26/2011 03:24" "12/29/2011 11:02" "03/17/2011 02:24"
## [17] "02/27/2011 13:51" "06/27/2011 08:36" "03/14/2011 10:54" "01/28/2011 14:14"
guessDateFormat(mydates)
[1] "%m.%d.%Y %H:%M"
Lubridate is the best option for this in my opinion. The following will work fine.
`data %>% mutate(date_variable = as.Date(dmy(date_variable)))`
Interestingly though I found dmy() to behave weirdly when as.Date() and dmy() were called in separate steps