Adding the Century to 2-Digit Year - r

I currently have a df that looks like
STA YR MO DA MAX date
58716 33013 43 3 11 60 0043-03-11
58717 33013 43 3 12 55 0043-03-12
58718 33013 43 3 13 63 0043-03-13
58719 33013 43 3 14 50 0043-03-14
58720 33013 43 3 15 58 0043-03-15
58721 33013 43 3 16 63 0043-03-16
I did df$date <- as.Date(with(df, paste(YR, MO, DA,sep="-")), "%Y-%m-%d") to get the date column, as you can see, but because there is no '19' in front of the year column, the year in the date comes out wrong. These are all 19xx dates. What would be a good way to fix this?

Try
df$date <- as.Date(with(df, paste(1900+YR, MO, DA,sep="-")), "%Y-%m-%d")

You should use %y since you have two digit year.
df$date <- as.Date(with(df, paste(YR, MO, DA,sep="-")), "%y-%m-%d")
However, this doesn't solve your problem since anything less than 69 is prefixed with 20 in 2 digit-years so 43 becomes 2043.
If you know that all your years are in the form of 19XX, you can do
# %02d zero-pads each part, so a single-digit year such as 3 becomes "1903"
# instead of "193" (which as.Date() would read as the year 193).
df$date <- as.Date(with(df, sprintf('19%02d-%02d-%02d', YR, MO, DA)))

If your years contain a mixture of 2-digit years from more than one century, then this code converts them all into valid dates in the past (no future dates).
# Convert 2-digit years (plus month and day) into Dates that are never in the
# future relative to a reference date.
#
# Arguments:
#   y, m, d   Vectors of 2-digit year (0-99), month and day. Integer-like
#             numbers or digit strings are accepted.
#   ref_date  Date treated as "today"; defaults to Sys.Date().
#
# Returns a Date vector. Each input is placed in the century of ref_date when
# the resulting date is on or before ref_date, otherwise in the century before
# it. Impossible dates (e.g. 30 February) and missing inputs yield NA.
dates_y2Y <- function(y, m, d, ref_date = Sys.Date()) {
  # Work with integers: no string-padding package is needed and the
  # comparison below is numeric instead of lexical.
  y <- as.integer(as.character(y))
  m <- as.integer(as.character(m))
  d <- as.integer(as.character(d))

  # Read the reference date once so year, month and day all belong to the
  # same instant.
  ref_date <- as.Date(ref_date)
  ref_year <- as.integer(format(ref_date, "%Y"))
  ref_key <- (ref_year %% 100L) * 10000L +
    as.integer(format(ref_date, "%m")) * 100L +
    as.integer(format(ref_date, "%d"))
  century <- (ref_year %/% 100L) * 100L

  # Compare yymmdd as a single number: on or before the reference date means
  # the current century, anything later must belong to the previous one.
  not_future <- (y * 10000L + m * 100L + d) <= ref_key
  full_year <- ifelse(not_future, century + y, century - 100L + y)

  # Build the date from an explicit 4-digit year. This avoids "%y" (whose
  # 00-68 => 20xx pivot is fixed) and returns a Date directly instead of
  # passing Dates through ifelse(), which would drop the class.
  as.Date(sprintf("%04d-%02d-%02d", full_year, m, d), format = "%Y-%m-%d")
}
df$date <- dates_y2Y(df$YR, df$MO, df$DA)
df
STA YR MO DA date
1 33013 23 1 31 1923-01-31
2 33013 43 2 30 <NA>
3 33013 63 5 5 1963-05-05
4 33013 83 7 27 1983-07-27
5 33013 3 12 9 2003-12-09
6 33013 20 4 21 2020-04-21
7 33013 20 4 22 1920-04-22
Data:
df <- structure(list(STA = c(33013L, 33013L, 33013L, 33013L, 33013L,
33013L, 33013L), YR = c(23L, 43L, 63L, 83L, 3L, 20L, 20L), MO = c(1L,
2L, 5L, 7L, 12L, 4L, 4L), DA = c(31L, 30L, 5L, 27L, 9L, 21L,
22L), date = structure(c(-17137, NA, -2433, 4955, 12395, 18373,
-18151), class = "Date")), row.names = c(NA, -7L), class = "data.frame")

another solution
library(dplyr)      # provides %>% and mutate()
library(lubridate)  # provides make_date()
# Build the date from numeric parts; adding 1900 turns the 2-digit year into 19xx.
df %>%
  mutate(date = make_date(year = 1900 + YR, month = MO, day = DA))

Another option with sprintf
# Spell out `fmt` (no partial argument matching), select the columns by name
# rather than position, and zero-pad so that YR = 3 yields "1903-12-09".
df$date <- as.Date(do.call(sprintf, c(fmt = '19%02d-%02d-%02d', df[c("YR", "MO", "DA")])))
Or with unite
library(dplyr)
library(tidyr)
library(stringr)
df %>%
  # Zero-pad the 2-digit year first so that e.g. 3 becomes "1903", not "193".
  mutate(YR = str_c('19', str_pad(YR, width = 2, pad = '0'))) %>%
  # Glue year, month and day into one "YYYY-M-D" string, keeping the inputs.
  unite(date, YR, MO, DA, sep="-", remove = FALSE) %>%
  mutate(date = as.Date(date))

Related

How can I merge different data sets in R knowing that the variable that I use for matching the two data set are not unique?

I have two datasets, and I need to merge them by the ID value. The problems are:
The ID value can be repeated across the same dataset (no other unique value is available).
The two datasets do not have the same number of rows or the same number of columns.
Example:
df1
ID
Gender
99
Male
85
Female
7
Male
df2
ID
Body_Temperature
Body_Temperature_date_time
99
36
1/1/2020 12:00 am
99
38
2/1/2020 10:30 am
99
37
1/1/2020 06:41 am
52
38
1/2/2020 11:00 am
11
39
4/5/2020 09:09 pm
7
35
9/8/2020 02:30 am
How can I turn these two datasets into one single dataset in a way that allows me to apply some machine learning models on it later on?
Depending on your expected results, if you are wanting to return all rows from each dataframe, then you can use a full_join from dplyr:
library(dplyr)
full_join(df2, df1, by = "ID")
Or with base R:
merge(x=df2,y=df1,by="ID",all=TRUE)
Output
ID Body_Temperature Body_Temperature_date_time Gender
1 99 36 1/1/2020 12:00 am Male
2 99 38 2/1/2020 10:30 am Male
3 99 37 1/1/2020 06:41 am Male
4 52 38 1/2/2020 11:00 am <NA>
5 11 39 4/5/2020 09:09 pm <NA>
6 7 35 9/8/2020 02:30 am Male
7 85 NA <NA> Female
If you have more than 2 dataframes to combine, which only overlap with the ID column, then you can use reduce on a dataframe list (so put all the dataframes that you want to combine into a list):
library(tidyverse)
df_list <- list(df1, df2)
multi_full <- reduce(df_list, function(x, y, ...)
full_join(x, y, by = "ID", ...))
Or Reduce with base R:
df_list <- list(df1, df2)
multi_full <- Reduce(function(x, y, ...)
merge(x, y, by = "ID", all = TRUE, ...), df_list)
Data
df1 <- structure(list(ID = c(99L, 85L, 7L), Gender = c("Male", "Female",
"Male")), class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(ID = c(99L, 99L, 99L, 52L, 11L, 7L), Body_Temperature = c(36L,
38L, 37L, 38L, 39L, 35L), Body_Temperature_date_time = c("1/1/2020 12:00 am",
"2/1/2020 10:30 am", "1/1/2020 06:41 am", "1/2/2020 11:00 am",
"4/5/2020 09:09 pm", "9/8/2020 02:30 am")), class = "data.frame", row.names = c(NA,
-6L))

R: Compute monthly averages for daily values

I have the following data which is stored as a data.frame in R:
Daily value of product A, B and C from 2018-08-01 until 2019-12-31
Now I would like to compute the monthly average of the value for each product. Additionally, only data for the weekdays but not the weekends should be used to calculate the monthly average for each product. What would be the approach in R to get to the required data?
Here is a solution, using dplyr and tidyr:
# Example data: one row per product, one value column per day.
# NOTE: data.frame() uses check.names = TRUE by default, so the "-" in these
# column names is turned into "." (e.g. "Value_2018.08.01"); the date format
# "%Y.%m.%d" used below relies on that.
df <- data.frame(Product = c("A", "B", "C"), "Value_2018-08-01" = c(120L, 100L, 90L),
"Value_2018-08-02" = c(80L, 140L, 20L), "Value_2018-08-03" = c(50L, 70L, 200L),
"Value_2018-12-31" = c(50L, 24L, 24L), "Value_2019-01-01" = c(44L, 60L, 29L),
"Value_2019-12-31" = c(99L, 49L, 49L))
# Pipeline: reshape to long form, derive the date from the column name,
# drop weekends, then average per product and calendar month.
df %>%
# One row per product and day; the old column name lands in `name`,
# the number in `value`.
tidyr::pivot_longer(c(starts_with("Value"))) %>%
# Strip everything up to the underscore and parse the rest as a Date.
mutate(Date = name,
Date = sub(".*_", "", Date),
Date = as.Date(Date, format="%Y.%m.%d"),
weekday = weekdays(Date)) %>%
# weekdays() returns names in the session locale; these are the German
# names for Saturday and Sunday.
filter(!weekday %in% c("Samstag", "Sonntag")) %>%
# Grouping by month number only pools the same month across different years.
group_by(Product, format(Date, "%m")) %>%
summarize(mean(value)) %>%
as.data.frame()
Product format(Date, "%m") mean(value)
1 A 01 44.00000
2 A 08 83.33333
3 A 12 74.50000
4 B 01 60.00000
5 B 08 103.33333
6 B 12 36.50000
7 C 01 29.00000
8 C 08 103.33333
9 C 12 36.50000
Note that Samstag and Sonntag should be changed to the names of the weekend days in the language of your working system.
Also, I've calculated the monthly averages as you asked. However, if you want monthly averages per year, you should change group_by(Product, format(Date, "%m")) to group_by(Product, format(Date, "%m"), format(Date, "%Y")).

Transform to wide format from long in R

I have a data frame in R which looks like below
Model Month Demand Inventory
A Jan 10 20
B Feb 30 40
A Feb 40 60
I want the data frame to look
Jan Feb
A_Demand 10 40
A_Inventory 20 60
A_coverage
B_Demand 30
B_Inventory 40
B_coverage
A_coverage and B_coverage will be calculated in Excel using a formula. The problem I need help with is pivoting the data frame from long format (the original format) to wide format.
I tried to implement the solution from the linked duplicate but I am still having difficulty:
HD_dcast <- reshape(data,idvar = c("Model","Inventory","Demand"),
timevar = "Month", direction = "wide")
Here is a dput of my data:
data <- structure(list(Model = c("A", "B", "A"), Month = c("Jan", "Feb",
"Feb"), Demand = c(10L, 30L, 40L), Inventory = c(20L, 40L, 60L
)), class = "data.frame", row.names = c(NA, -3L))
Thanks
Here's an approach with dplyr and tidyr, two popular R packages for data manipulation:
library(dplyr)
library(tidyr)
data %>%
mutate(coverage = NA_real_) %>%
pivot_longer(-c(Model,Month), names_to = "Variable") %>%
pivot_wider(id_cols = c(Model, Variable), names_from = Month ) %>%
unite(Variable, c(Model,Variable), sep = "_")
## A tibble: 6 x 3
# Variable Jan Feb
# <chr> <dbl> <dbl>
#1 A_Demand 10 40
#2 A_Inventory 20 60
#3 A_coverage NA NA
#4 B_Demand NA 30
#5 B_Inventory NA 40
#6 B_coverage NA NA

R programming_ Subsetting rows on logic conditions

Sample data:
sampleData
Ozone Solar.R Wind Temp Month Day sampleData.Ozone
1 41 190 7.4 67 5 1 41
2 36 118 8.0 72 5 2 36
3 12 149 12.6 74 5 3 12
.........
Want to extract records on the condition $ozone > 31
Here is the code:
data <- sampleData[sampleData$ozone > 31]
And get the error below:
Error in if (inherits(X[[j]], "data.frame") && ncol(xj) > 1L) X[[j]] <- as.matrix(X[[j]]) :
missing value where TRUE/FALSE needed
How should I correct it? Thanks!
R is case sensitive, so your ozone has to match the name in your data.frame. Also to subset a data.frame, you need two indices (row and column) separated by a comma. If there is nothing after the comma, it means that you are selecting all the columns:
# which() keeps only the rows where the condition is TRUE; a bare logical
# index would also return an all-NA row for every missing Ozone value.
sampleData[which(sampleData$Ozone > 31), ]
Other methods to subset a data.frame:
subset(sampleData, Ozone > 31)
or with dplyr:
library(dplyr)
sampleData %>%
filter(Ozone > 31)
Result:
Ozone Solar.R Wind Temp Month Day sampleData.Ozone
1 41 190 7.4 67 5 1 41
2 36 118 8.0 72 5 2 36
Data:
sampleData = structure(list(Ozone = c(41L, 36L, 12L), Solar.R = c(190L, 118L,
149L), Wind = c(7.4, 8, 12.6), Temp = c(67L, 72L, 74L), Month = c(5L,
5L, 5L), Day = 1:3, sampleData.Ozone = c(41L, 36L, 12L)), .Names = c("Ozone",
"Solar.R", "Wind", "Temp", "Month", "Day", "sampleData.Ozone"
), class = "data.frame", row.names = c("1", "2", "3"))

Average data by month for a given latitude and longitude?

I have a table with the following headers and example data
Lat Long Date Value.
30.497478 -87.880258 01/01/2016 10
30.497478 -87.880258 01/02/2016 15
30.497478 -87.880258 01/05/2016 20
33.284928 -85.803608 01/02/2016 10
33.284928 -85.803608 01/03/2016 15
33.284928 -85.803608 01/05/2016 20
I would like to average the value column on monthly basis for a particular location.
So example output would be
Lat Long Month Avg Value
30.497478 -87.880258 January 15
A solution using dplyr and lubridate.
library(dplyr)
library(lubridate)
dt2 <- dt %>%
mutate(Date = mdy(Date), Month = month(Date)) %>%
group_by(Lat, Long, Month) %>%
summarise(`Avg Value` = mean(Value))
dt2
# A tibble: 2 x 4
# Groups: Lat, Long [?]
Lat Long Month `Avg Value`
<dbl> <dbl> <dbl> <dbl>
1 30.49748 -87.88026 1 15
2 33.28493 -85.80361 1 15
You can try the following, but it first modifies the data frame adding an extra column, Month, using package zoo.
library(zoo)
dat$Month <- as.yearmon(as.Date(dat$Date, "%m/%d/%Y"))
aggregate(Value. ~ Lat + Long + Month, dat, mean)
# Lat Long Month Value.
#1 30.49748 -87.88026 jan 2016 15
#2 33.28493 -85.80361 jan 2016 15
If you don't want to change the original data, make a copy dat2 <- dat and change the copy.
DATA
dat <-
structure(list(Lat = c(30.497478, 30.497478, 30.497478, 33.284928,
33.284928, 33.284928), Long = c(-87.880258, -87.880258, -87.880258,
-85.803608, -85.803608, -85.803608), Date = structure(c(1L, 2L,
4L, 2L, 3L, 4L), .Label = c("01/01/2016", "01/02/2016", "01/03/2016",
"01/05/2016"), class = "factor"), Value. = c(10L, 15L, 20L, 10L,
15L, 20L)), .Names = c("Lat", "Long", "Date", "Value."), class = "data.frame", row.names = c(NA,
-6L))
EDIT.
If you want to compute several statistics, you can define a function that computes them and returns a named vector and call it in aggregate, like the following.
# Summarise a numeric vector as a named vector holding its mean, median and
# standard deviation (names: Mean, Median, SD).
stat <- function(x) {
  centre <- mean(x)
  middle <- median(x)
  spread <- sd(x)
  c(Mean = centre, Median = middle, SD = spread)
}
agg <- aggregate(Value. ~ Lat + Long + Month, dat, stat)
agg <- cbind(agg[1:3], as.data.frame(agg[[4]]))
agg
# Lat Long Month Mean Median SD
#1 30.49748 -87.88026 jan 2016 15 15 5
#2 33.28493 -85.80361 jan 2016 15 15 5

Resources