Convert days to calendar dates within a data frame in R - r

I have a dataframe like
ID |TRTSDT| TRTEDT
101|17952 | 18037
102|17956 | 18041
How can I convert the day counts into Date format? Thank you.

Try
# Turn every column except the first (ID) from a day count into a Date,
# counting days from 1970-01-01.
df1[-1] <- lapply(
  df1[-1],
  function(days) as.Date(days, origin = "1970-01-01")
)
data
# Example input: two IDs with integer start (TRTSDT) and end (TRTEDT) columns.
df1 <- data.frame(
  ID = 101:102,
  TRTSDT = c(17952L, 17956L),
  TRTEDT = c(18037L, 18041L)
)

Related

How to fill dataframe in R with months and NA values

This is my dataframe:
# Example tibble with six rows: month_date is a Date column (stored as day
# numbers) and Values is numeric.
df <- structure(list(month_date = structure(c(19117, 19149, 19180,
19212, 19244, 19275), class = "Date"), Values = c(9693, 10227,
10742, 11672, 10565, 10080)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
I need to extend the column month_date until "2023-12-01" with "NA" values.
The output should be a data frame with months until "2023-12-01" and with the Values column filled with "NA" values starting on "2022-11-01".
How can I do this?
library(tidyr)
# Extend month_date up to 2023-12-01 in monthly steps (the original daily step
# produced one row per day instead of one per month). Rows added for dates not
# already present in df get NA in Values.
# NOTE(review): the sequence keeps the day-of-month of min(month_date);
# confirm the existing dates fall on that day so they line up with it.
complete(
  df,
  month_date = seq(min(month_date), as.Date("2023-12-01"), by = "1 month")
)
You can also create a separate data frame/tibble if for some reason you do not want to use tidyr:
# Build the extra months (2022-11-01 through 2023-12-01) with NA values,
# then append them below the existing rows.
add <- data.frame(
  month_date = seq(
    from = as.Date("2022-11-01"),
    to = as.Date("2023-12-01"),
    by = "month"
  ),
  Values = NA
)
final <- rbind(df, add)

use dplyr to get list items from dataframe in R

I have a dataframe being returned from Microsoft365R:
# Example one-row data frame with nested data-frame columns:
# lastModifiedBy holds a `user` data frame (email, id, displayName) and
# fileSystemInfo holds the created/last-modified timestamps as strings.
SKA_student <- structure(list(name = "Computing SKA 2021-22.xlsx", size = 22266L,
lastModifiedBy =
structure(list(user =
structure(list(email = "my#email.com",
id = "8ae50289-d7af-4779-91dc-e4638421f422",
displayName = "Name, My"), class = "data.frame", row.names = c(NA, -1L))),
class = "data.frame", row.names = c(NA, -1L)),
fileSystemInfo = structure(list(
createdDateTime = "2021-09-08T16:03:38Z",
lastModifiedDateTime = "2021-09-16T00:09:04Z"), class = "data.frame", row.names = c(NA,-1L))), row.names = c(NA, -1L), class = "data.frame")
I can return all the lastModifiedBy data through:
SKA_student %>% select(lastModifiedBy)
lastModifiedBy.user.email lastModifiedBy.user.id lastModifiedBy.user.displayName
1 my#email.com 8ae50289-d7af-4779-91dc-e4638421f422 Name, My
But if I want a specific item in the lastModifiedBy list, it doesn't work, e.g.:
SKA_student %>% select(lastModifiedBy.user.email)
Error: Can't subset columns that don't exist.
x Column `lastModifiedBy.user.email` doesn't exist.
I can get this working through base, but would really like a dplyr answer
This function allows you to flatten all the list columns (I found this ages ago on SO but can't find the original post for credit)
# Flatten list-like columns (including nested data frames) into ordinary
# top-level columns named by their full path, e.g. `outer.inner.field`.
#
# data: a data frame, possibly containing list or data-frame columns.
# Returns `data` unchanged when it has no list-like columns; otherwise a data
# frame with the non-list columns first, followed by the flattened values.
SO_flat_cols <- function(data) {
  # vapply() always yields a logical vector, even for a zero-column input,
  # where sapply() would return an empty list and break the `!` below.
  ListCols <- vapply(data, is.list, logical(1))
  if (!any(ListCols)) {
    return(data)
  }
  # apply() coerces the list-like columns to a matrix first, so the flattened
  # values all end up with one common type.
  cbind(data[!ListCols], t(apply(data[ListCols], 1, unlist)))
}
Then you can select as you like.
# Flatten first, then pick the e-mail column by its flattened name.
SKA_student %>%
  SO_flat_cols() %>%
  select(lastModifiedBy.user.email)
Alternatively you can get to the end by recursively pulling the lists
# Step down through the nested data frames one level at a time
# (lastModifiedBy, then user) and keep only the email column.
select(pull(pull(SKA_student, lastModifiedBy), user), email)
You could use
library(dplyr)
library(tidyr)
# Widen lastModifiedBy into its components, then keep only email.
select(unnest_wider(SKA_student, lastModifiedBy), email)
This returns
# A tibble: 1 x 1
email
<chr>
1 my#email.com

How to plot layers of tuples on same plot in R?

I am trying to plot the time and NDVI for each region on the same plot. I think to do this I have to convert the date column from characters to time and then plot each layer. However I cannot figure out how to do this. Any thoughts?
# Question data: a list of four tibbles. The first is a single-row placeholder
# with NA HRpcode/timeseries; the other three each hold six observations with
# `time` and ` NDVI` stored as character strings (note the leading space in
# the ` NDVI` column name).
list(structure(list(observation = 1L, HRpcode = NA_character_,
timeseries = NA_character_), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(observation = 1:6, time = c("2014-01-01",
"2014-02-01", "2014-03-01", "2014-04-01", "2014-05-01", "2014-06-01"
), ` NDVI` = c("0.3793765496776215", "0.21686891782421552", "0.3785652933528299",
"0.41027240624704164", "0.4035578030242673", "0.341299793064468"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
)), structure(list(observation = 1:6, time = c("2014-01-01",
"2014-02-01", "2014-03-01", "2014-04-01", "2014-05-01", "2014-06-01"
), ` NDVI` = c("0.4071076986818826", "0.09090719657570319", "0.35214166081795284",
"0.4444311032927228", "0.5220702877666005", "0.5732370503295022"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
)), structure(list(observation = 1:6, time = c("2014-01-01",
"2014-02-01", "2014-03-01", "2014-04-01", "2014-05-01", "2014-06-01"
), ` NDVI` = c("0.3412131556625801", "0.18815996897460135", "0.5218904976415136",
"0.6970128777711452", "0.7229657162729096", "0.535967435470161"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
)))
111
First we need to clean your data. The first element in this list is empty
# Drop the first list element.
df <- df[-1]
Now we need to make a data.frame
# Stack the remaining list elements row-wise into one data frame.
df <- do.call("rbind", df)
I am going to add a region variable, change the name of NDVI to remove the space,
change ndvi into a numeric vector, and change time into a Date object
library(dplyr)
# Rename the space-prefixed NDVI column, then add a region factor (three
# regions, six rows each) and convert ndvi to numeric and time to Date.
df <- df %>%
  rename(ndvi = " NDVI") %>%
  mutate(
    region = factor(rep(1:3, each = 6)),
    ndvi = as.numeric(ndvi),
    time = as.Date(time)
  )
Now we can use ggplot2 to plot the data by region
library(ggplot2)
# One line per region: time on x, ndvi on y, colour by region.
g <- ggplot(df, aes(x = time, y = ndvi, colour = region)) +
  geom_line()
g
Which gives this plot:
Here's an approach with lubridate to handle dates and dplyr to make the binding of the data.frames easier to understand.
Note that the group names are taken from the names of the list, and since those don't exist in the data you provided, we have to set them in advance.
library(lubridate)
library(ggplot2)
library(dplyr)
# Label the list elements so bind_rows() can record the source of each row.
# NOTE(review): assumes `data` holds exactly the three region tibbles (any
# empty placeholder element already dropped) — confirm before running.
names(data) <- 1:3
# Stack the tibbles; `.id` stores each element's name in a `group` column.
data <- bind_rows(data, .id = "group")
data$time <- ymd(data$time)
# The source column name carries a leading space; rename it with dplyr
# (setnames() is a data.table function, which this snippet does not load).
data <- rename(data, NDVI = " NDVI")
data$NDVI <- as.numeric(data$NDVI)
# Colour by the `group` column created above; column names are case-sensitive,
# so `Group` would not be found.
ggplot(data, aes(x = time, y = NDVI, color = group)) + geom_line()

Replace a string from lookuptable in R

I have a txt file with a list:
name
Test_123
run_456
Test_789
I have another lookuptable that contains the "ID" and gives me a "Plate"
ID plate
123 xxx
456 zzz
789 bbb
Would love to get here
Test_xxx
run_zzz
Test_bbb
My current code does not work entirely.
I either get <NA> (I guess it looks for exact values rather than for a substring) or I get errors.
Thanks so much for your help!
B
A tidyverse way to do this would be:
library(tidyverse)
# Split "prefix_ID" into its two parts, look up the plate for each ID, and
# glue the prefix and plate back together.
df1 %>%
  separate(col = name, into = c("name", "ID"), convert = TRUE) %>%
  left_join(df2, by = "ID") %>%
  mutate(new_name = paste0(name, "_", plate))
Using:
# Example inputs: names to rewrite (df1) and the ID -> plate lookup (df2).
df1 <- data.frame(
  name = c("Test_123", "run_456", "Test_789"),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  ID = c(123L, 456L, 789L),
  plate = c("xxx", "zzz", "bbb"),
  stringsAsFactors = FALSE
)
Note that:
separate(..., convert=TRUE) uses some heuristics to convert character columns to integer. You can otherwise do this manually: mutate(ID=as.integer(ID))
You could use unite() (which does the opposite of separate()) instead of mutate(new_name = paste(name, plate, sep="_")), which would also remove the previous columns
An option would be gsubfn
library(gsubfn)
# Replace each run of digits with the plate stored under that ID in a named
# list built from df2.
gsubfn(
  pattern = "(\\d+)",
  replacement = setNames(as.list(df2$plate), df2$ID),
  x = df1$name
)
#[1] "Test_xxx" "run_zzz" "Test_bbb"
data
# Example inputs: names to rewrite (df1) and the ID -> plate lookup (df2).
df1 <- data.frame(
  name = c("Test_123", "run_456", "Test_789"),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  ID = c(123L, 456L, 789L),
  plate = c("xxx", "zzz", "bbb"),
  stringsAsFactors = FALSE
)
For a base R option, you could add a new column to your first data frame with the exact join data:
# ID: strip everything up to the underscore that precedes the digits.
df1[["ID"]] <- sub(".*_(?=[0-9]+)", "", df1[["name"]], perl = TRUE)
# start: the name with its "_<digits>" part removed.
df1[["start"]] <- sub("_[0-9]+", "", df1[["name"]])
Then, use merge:
# Join the two tables on the shared ID column.
result <- merge(x = df1, y = df2, by = "ID")
And finally create your desired output column:
# Combine the name prefix with the looked-up plate, then show the result.
result[["out"]] <- paste(result[["start"]], result[["plate"]], sep = "_")
result[["out"]]
[1] "Test_xxx" "run_zzz" "Test_bbb"
Data:
# Example inputs for the base R answer; ID is kept as character so it matches
# the ID strings extracted from the names.
df1 <- data.frame(
  name = c("Test_123", "run_456", "Test_789"),
  stringsAsFactors = FALSE
)
df2 <- data.frame(
  ID = c("123", "456", "789"),
  plate = c("xxx", "zzz", "bbb"),
  stringsAsFactors = FALSE
)
Demo

lapply date format to many dataframes

I'm having difficulties cycling through a list and applying the same format to the same variable in many data frames:
# Example tibbles, each with a single POSIXct `datetime` column in UTC:
# df1 has five rows and df2 has three.
df1 <- structure(list(datetime = structure(c(1446336120, 1446336180,
1446336240, 1446336300, 1446336360), class = c("POSIXct", "POSIXt"
), tzone = "UTC")), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"), .Names = "datetime")
df2 <- structure(list(datetime = structure(c(1446336120, 1446336180,
1446336240), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame"), .Names = "datetime")
I want to apply the same format to all dataframes:
# Replace each datetime column with its "month/day/year hour:minute:second"
# text representation.
df1[["datetime"]] <- format(df1[["datetime"]], "%m/%d/%Y %H:%M:%S")
df2[["datetime"]] <- format(df2[["datetime"]], "%m/%d/%Y %H:%M:%S")
I tried this:
# Gather every object whose name contains "df" into a named list, then format
# the datetime column of each element by position.
list_df <- mget(ls(pattern = "df"))
lapply(seq_along(list_df), function(idx) {
  current <- list_df[[idx]]
  format(current$datetime, "%m/%d/%Y %H:%M:%S")
})
but not sure how to assign it back to each dataframe.
I think your current approach is not far off, but you never make an assignment back to the data frame. Add each data frame to a list and then use lapply:
# Collect the data frames, then reformat `datetime` in each one. The result is
# assigned back to `lst`: lapply() returns a new list and leaves its input
# untouched, so without the assignment the formatted copies would be lost.
lst <- list(df1, df2)
lst <- lapply(lst, function(x) {
  x$datetime <- format(x$datetime, "%m/%d/%Y %H:%M:%S")
  x
})
At this point you have a list of data frames in the format you want. If you then later wanted to export each data frame to a CSV file, you could try this:
# Write each data frame in `lst` to out<i>.csv. seq_along() yields an empty
# sequence for an empty list, whereas 1:length(lst) would iterate over 1 and 0.
for (i in seq_along(lst)) {
  filename <- paste0("out", i, ".csv")
  write.csv(lst[[i]], file = filename)
}
library(dplyr)
# dplyr variant: reformat the datetime column of every data frame in list_df.
lapply(list_df, function(tbl) {
  mutate(tbl, datetime = format(datetime, "%m/%d/%Y %H:%M:%S"))
})

Resources