Calculate the percentage of missing values per month in a dataframe - r

I create the following dataframe:
df <- data.frame(seq(from = as.Date("2001-01-01"), to = as.Date("2001-12-31"), by = 1),
                 seq(1, 365), seq(1, 365), seq(1, 365), seq(1, 365))
colnames(df) <- c("date", "C1", "C2", "C3", "C4")
df$C1[50:100] <- NA
df$C2[20:80] <- NA
df$C3[70:150] <- NA
df$C4[250:300] <- NA
I would like to calculate the percentage of missing values per month, not just per column but for the whole dataset.
Is there an efficient way to do this?

library(dplyr)
library(lubridate)
# is.na(.) can be made more specific, e.g. is.na(.[, 2:5]) or is.na(.[, grepl("C", colnames(df))])
df %>%
  mutate(Month = month(date), Mis = rowSums(is.na(.))) %>%
  group_by(Month) %>%
  summarise(Sum = sum(Mis),
            Percentage = 100 * mean(Mis) / 4) # share of the month's cells that are NA (4 data columns)
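If you prefer base R, here is a minimal sketch of the same computation; it assumes only the df built above and counts NA cells against the cells available each month:
mon <- format(df$date, "%m")                         # month as "01".."12"
na_cells  <- tapply(rowSums(is.na(df[, 2:5])), mon, sum)
all_cells <- table(mon) * 4                          # 4 data columns per row
round(100 * na_cells / all_cells, 1)                 # percent missing per month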

Related

New column with random boolean values while controlling the ratio of TRUE/FALSE per category

In R I've got a dataset like this one:
df <- data.frame(
  ID = c(1:30),
  x1 = seq(0, 1, length.out = 30),
  x2 = seq(100, 3000, length.out = 30),
  category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
Now I want to add a new column with randomized boolean values, but inside each category the proportion of TRUE and FALSE values should be the same (i.e. the randomization should generate equal counts of TRUE and FALSE values; in the above data frame, 5 TRUEs and 5 FALSEs in each of the 3 categories). How can I do this?
You can sample a balanced vector of TRUE and FALSE values without replacement, which gives you a randomized and balanced column in your data frame (use logicals rather than the strings "TRUE"/"FALSE" so the column is genuinely boolean):
sample(rep(c(TRUE, FALSE), each = 5), 10, replace = FALSE)
Based on Yacine Hajji answer:
addRandomBool <- function(df, p) {
  n <- ceiling(nrow(df) * p)
  df$bool <- sample(rep(c(TRUE, FALSE), times = c(n, nrow(df) - n)))
  df
}
Reduce(rbind, lapply(split(df, df$category), addRandomBool, p = 0.5))
where the parameter p determines the proportion of TRUE values (rounded up with ceiling, so p = 0.3 marks ceiling(10 * 0.3) = 3 rows TRUE in a 10-row group).
This will sample within each group from a vector of 5 TRUE and 5 FALSE without replacement. It will assume that there are always 10 records per group.
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
  group_by(category) %>%
  nest() %>%
  mutate(data = lapply(data,
                       function(df) { # function to sample and assign the new_col
                         df$new_col <- sample(rep(c(FALSE, TRUE), each = 5),
                                              size = 10,
                                              replace = FALSE)
                         df
                       })) %>%
  unnest(cols = "data")
This next example is a little more generalized, but still assumes an (approximately) even distribution of TRUE and FALSE within a group. It can accommodate variable group sizes, and even groups with an odd number of records (though it will favor FALSE when the count is odd).
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
  group_by(category) %>%
  nest() %>%
  mutate(data = lapply(data,
                       function(df) {
                         df$new_col <- sample(rep(c(FALSE, TRUE),
                                                  length.out = nrow(df)),
                                              size = nrow(df),
                                              replace = FALSE)
                         df
                       })) %>%
  unnest(cols = "data")
Maintaining Column Order
A couple of options to maintain the column order:
First, you can save the column order before you do your group_by()/nest(), and then use select to restore the order when you're done.
set.seed(pi)
orig_col <- names(df) # original column order
df %>%
  group_by(category) %>%
  nest() %>%
  mutate(data = lapply(data,
                       function(df) {
                         df$new_col <- sample(rep(c(FALSE, TRUE),
                                                  length.out = nrow(df)),
                                              size = nrow(df),
                                              replace = FALSE)
                         df
                       })) %>%
  unnest(cols = "data") %>%
  select_at(c(orig_col, "new_col")) # restore the column order
Or you can use a base R solution that doesn't change the column order in the first place:
df <- split(df, df["category"])
df <- lapply(df,
             function(df) {
               df$new_col <- sample(rep(c(FALSE, TRUE),
                                        length.out = nrow(df)),
                                    size = nrow(df),
                                    replace = FALSE)
               df
             })
do.call("rbind", c(df, list(make.row.names = FALSE)))
There are likely a dozen other ways to do this, and probably more efficient ways that I'm not thinking of.
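One such alternative, as a sketch: base R ave() can permute within groups in a single call. It assumes df is still the original data frame (re-create it if you ran the split() version above, which overwrote df); odd-sized groups again get the extra FALSE.
# rank a random draw within each category; the top half becomes TRUE
df$new_col <- as.logical(
  ave(runif(nrow(df)), df$category,
      FUN = function(z) rank(z) <= length(z) / 2)
)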

Creating list with the same number of values

I have a data set with a date, ID, and coordinates that I would like to split into seasonal months. For example, for winter I assign January to winter1, February to winter2, and March to winter3. I have done the same for the summer months.
I would like to filter out the IDs that have all of these months, so that when I split the data by ID and year, I would have identical list lengths.
I wasn't sure how to simulate uneven values for each ID in the sample code below, but in my actual data some IDs only have summer1 and not winter1, while it could be flipped around for summer2 and winter2.
library(lubridate)
library(tidyverse)
date <- rep_len(seq(dmy("01-01-2010"), dmy("31-12-2013"), by = "days"),1000)
ID <- rep(seq(1, 5), 100)
df <- data.frame(date = date,
                 x = runif(length(date), min = 60000, max = 80000),
                 y = runif(length(date), min = 800000, max = 900000),
                 ID)
df$month <- month(df$date)
df$year <- year(df$date)
df1 <- df %>%
  mutate(season_categ = case_when(month %in% 6 ~ 'summer1',
                                  month %in% 7 ~ 'summer2',
                                  month %in% 8 ~ 'summer3',
                                  month %in% 1 ~ 'winter1',
                                  month %in% 2 ~ 'winter2',
                                  month %in% 3 ~ 'winter3')) %>%
  group_by(year, ID) %>%
  filter(any(month %in% 6:8) &
           any(month %in% 1:3))
summer_list <- df1 %>%
  filter(season_categ == "summer1") %>%
  group_split(year, ID)
# Rename the list elements to AnimalID_year
names(summer_list) <- sapply(summer_list,
                             function(x) paste(x$ID[1],
                                               x$year[1], sep = '_'))
# Create a list for each year and ID
winter_list <- df1 %>%
  filter(season_categ == "winter1") %>%
  group_split(year, ID)
names(winter_list) <- sapply(winter_list,
                             function(x) paste(x$ID[1],
                                               x$year[1], sep = '_'))
Not sure if this is exactly what you want, but I understood it as: drop any ID that does not have all six months of Q1 and Q3 in every year. You can modify the filter or grouping if that assumption is wrong.
Here is one approach:
library(lubridate)
library(dplyr)
set.seed(12345)
# random sampling of dates with this seed gives no July date for ID 2 in 2010
df <- tibble(
  date = sample(seq(dmy("01-01-2010"), dmy("31-12-2013"), by = "days"),
                1000, replace = TRUE),
  x = runif(length(date), min = 60000, max = 80000),
  y = runif(length(date), min = 800000, max = 900000),
  ID = rep(1:5, 200),
  month = month(date),
  year = year(date)) %>%
arrange(ID, date)
df %>%
  filter(month %in% c(1:3, 6:8)) %>%
  group_by(ID, year) %>%
  mutate(complete = length(unique(month)) == 6) %>%
  group_by(ID) %>%
  filter(all(complete)) %>%
  group_by(ID, year) %>%
  group_split()
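If you also want the ID_year element names used in the question's summer_list and winter_list, the same sapply() trick applies. A sketch, assuming the pipeline above is assigned to season_list (a hypothetical name):
names(season_list) <- sapply(season_list,
                             function(x) paste(x$ID[1], x$year[1], sep = "_"))
sapply(season_list, nrow) # rows per ID_year element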
To me it is not really clear what you are looking for. Before you split the data into a list, sort the rows:
df1 <- df1[order(df1$ID, df1$season_categ), ]
### Determine which IDs have uneven numbers ###
df1 %>%
  group_by(ID) %>%
  summarize(month_seq = paste(season_categ, collapse = "_"),
            number_of_months = n_distinct(season_categ))
#### Remove odd numbers###
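One hypothetical way to finish that last step (an assumption, since it isn't shown above) is to keep only the IDs that have all six season categories:
# keep only IDs with all six season categories (hypothetical completion)
df1 %>%
  group_by(ID) %>%
  filter(n_distinct(season_categ) == 6)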

How to select the top and low 10% values of a data.frame in R?

Question updated:
I have the data.frame DF with two variables. I would like to have the high 10% and the low 10% of the DF data saved in separate data.frames. Below is my sample data set with the desired output. Any comments/suggestions are appreciated.
library(tidyverse)
DF <- data.frame(D1 = 1:30, D2 = 36:65)
Desired output:
DF_High <- data.frame(D1 = c(30,29,28), D2 = c(65,64,63))
DF_Low <- data.frame(D1 = c(1,2,3), D2 = c(36,37,38))
We can use across to get the top 10% of values from each column.
library(dplyr)
n <- 0.1 * nrow(DF)
Dfhigh <- DF %>% summarise(across(.fns = ~ sort(.x, decreasing = TRUE)[1:n]))
Dflow  <- DF %>% summarise(across(.fns = ~ sort(.x)[1:n]))
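With the sample DF, n is 3 and Dfhigh reproduces the desired output:
Dfhigh
#   D1 D2
# 1 30 65
# 2 29 64
# 3 28 63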
In base R, we can do:
Dfhigh <- sapply(DF, function(x) sort(x, decreasing = TRUE)[1:n])
Dflow <- sapply(DF, function(x) sort(x)[1:n])
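Another base R sketch, if you would rather keep whole rows than sort each column independently: filter by quantile thresholds. Here the threshold is taken on D1 (an assumption); with this sample data D2 moves in lockstep, so the result matches the desired output.
DF_Low  <- DF[DF$D1 <= quantile(DF$D1, 0.10), ] # rows with D1 in the bottom 10%
DF_High <- DF[DF$D1 >= quantile(DF$D1, 0.90), ] # rows with D1 in the top 10%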
Try this approach; based on the comments, you want to order twice and then extract. Here is the code using your data DF:
# Code for high
DF2 <- DF[order(-DF$D1, -DF$D2), ]
# Code for low
DF3 <- DF[order(DF$D1, DF$D2), ]
# Extract
DFHigh <- head(DF2, 0.10 * nrow(DF2))
DFLow <- head(DF3, 0.10 * nrow(DF3))
Update: here is a solution that reshapes the data:
library(tidyverse)
set.seed(123)
#Data
DF <- data.frame(D1 = runif(500, 1, 50), D2 = runif(100, 5, 60))
# Code
Melted <- DF %>%
  pivot_longer(everything()) %>%
  arrange(name, desc(value)) %>%
  group_by(name) %>%
  mutate(id = 1:n()) %>%
  pivot_wider(names_from = name, values_from = value)
# Choose high and low
dfhigh <- head(Melted, 0.10 * nrow(Melted))
dflow <- tail(Melted, 0.10 * nrow(Melted))

Creating all possible variable combinations in R

I have a daily dataset of 4 parameters which I have converted into monthly data using the following code:
library(zoo)
library(hydroTSM)
library(lubridate)
library(tidyverse)
set.seed(123)
df <- data.frame("date"= seq(from = as.Date("1983-1-1"), to = as.Date("2018-12-31"), by = "day"),
"Parameter1" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 15, 35),
"Parameter2" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 11, 29),
"Parameter3" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 50, 90),
"Parameter4" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 0, 27))
Monthly_data <- daily2monthly(df, FUN=mean, na.rm=TRUE)
After that, I reshaped it so that each column represents a month, using the following code:
# Function to convert a month abbreviation to a numeric month
mo2Num <- function(x) match(tolower(x), tolower(month.abb))
Monthly_data %>%
  dplyr::as_tibble(rownames = "date") %>%
  separate("date", c("Month", "Year"), sep = "-", convert = T) %>%
  mutate(Month = mo2Num(Month)) %>%
  tidyr::pivot_longer(cols = -c(Month, Year)) %>%
  pivot_wider(names_from = Month, values_from = value, names_prefix = "Mon",
              names_sep = "_") %>%
  arrange(name)
Now I want to create the parameter combinations Parameter1 * Parameter2, Parameter1 * Parameter3, Parameter1 * Parameter4, Parameter2 * Parameter3, Parameter2 * Parameter4, and Parameter3 * Parameter4. A combination like Parameter1 * Parameter2 means multiplying their monthly values; the result should then be added to the pivoted monthly data with rbind, and likewise for all the other combinations. How can I achieve this?
You can use this base R approach with combn, assuming data is present for all the years for all parameters. Here df1 is the dataframe from the output above ending with arrange(name).
data <- combn(unique(df1$name), 2, function(x) {
  t1 <- subset(df1, name == x[1])
  t2 <- subset(df1, name == x[2])
  t3 <- t1[-(1:2)] * t2[-(1:2)]
  t3$name <- paste0(x, collapse = "_")
  cbind(t3, t1[1])
}, simplify = FALSE)
You can then rbind it to the original data.
new_data <- rbind(df1, do.call(rbind, data))
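For reference, combn enumerates exactly the six pairs the question lists:
combn(paste0("Parameter", 1:4), 2, paste, collapse = " * ")
# "Parameter1 * Parameter2" "Parameter1 * Parameter3" "Parameter1 * Parameter4"
# "Parameter2 * Parameter3" "Parameter2 * Parameter4" "Parameter3 * Parameter4"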

Precipitation values for every 5 minutes to hourly summaries in R

I am trying to get the total precipitation values for every hour from a personal weather station I have, using the weatherData package. The problem is that the data is collected every five minutes and the values repeat themselves until there is a change in the precipitation value. I have tried the duplicated function, but a large amount of data is removed when there is no precipitation, which makes it hard to get a summary of the hourly precipitation.
Please see the code below:
## Load required libraries
library(weatherData)
library(ggplot2)
library(scales)
library(plyr)
library(reshape2)
library(gridExtra)
library(lubridate)
library(weathermetrics)
library(zoo)
# Get data for PWS using the weatherData package
pws <- getWeatherForDate("IPENANGB2", "2014-09-01", "2014-09-30",
                         station_type = "id", opt_detailed = T,
                         opt_custom_columns = T,
                         custom_columns = c(1, 2, 6, 7, 10))
# Rename columns
colnames(pws) <- c("time", "time1", "tempc", "wdd", "wspd", "prcp")
## Add date columns
pws$time <- as.POSIXct(pws$time1, format = "%Y-%m-%d %H:%M:%S", tz = "Australia/Perth")
pws$year <- as.numeric(format(pws$time, "%Y"))
pws$date <- as.Date(pws$time, format = "%Y-%m-%d", tz = "Australia/Perth")
pws$year <- as.numeric(as.POSIXlt(pws$date)$year + 1900)
pws$month <- as.numeric(as.POSIXlt(pws$date)$mon + 1)
pws$monthf <- factor(pws$month, levels = as.character(1:12),
                     labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                                "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
                     ordered = TRUE)
pws$weekday <- as.POSIXlt(pws$date)$wday # 0 = Sunday, 6 = Saturday
pws$weekdayf <- factor(pws$weekday, levels = rev(0:6),
                       labels = rev(c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")),
                       ordered = TRUE)
pws$yearmonth <- as.yearmon(pws$date)
pws$yearmonthf <- factor(pws$yearmonth)
pws$week <- as.numeric(format(as.Date(pws$date), "%W"))
pws$weekf <- factor(pws$week)
pws$jday <- yday(pws$date)
pws$hour <- as.numeric(format(strptime(pws$time, format = "%Y-%m-%d %H:%M"), format = "%H"))
pws$min <- as.numeric(format(strptime(pws$time, format = "%Y-%m-%d %H:%M"), format = "%M"))
# Remove duplicate values
pws.df <- pws[!duplicated(pws$prcp), ]
Assuming you want to get hourly averages of tempc, wdd, wspd, prcp:
# used packages
library(weatherData)
library(lubridate)
library(dplyr)
library(stringr)
# read data
pws <- getWeatherForDate("IPENANGB2",
                         "2014-09-01",
                         "2014-09-30",
                         station_type = "id",
                         opt_detailed = T,
                         opt_custom_columns = T,
                         custom_columns = c(1, 2, 6, 7, 10))
# rename columns
colnames(pws) <- c("time", "time1", "tempc", "wdd", "wspd", "prcp")
# cleaning dataset and adding some columns
useful_pws <-
  pws %>%
  select(2:6) %>%
  filter(!str_detect(time1, "<br>")) %>%
  mutate(time1 = ymd_hms(time1),
         year = year(time1),
         month = month(time1),
         day = day(time1),
         hour = hour(time1)) %>%
  tbl_df()
# summarising dataset
useful_pws %>%
  select(-time1) %>%
  group_by(year, month, day, hour) %>%
  summarise(tempc = mean(tempc, na.rm = TRUE),
            wdd = mean(wdd, na.rm = TRUE),
            wspd = mean(wspd, na.rm = TRUE),
            prcp = mean(prcp, na.rm = TRUE))
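One caveat: if prcp is a running (cumulative) total, as the question's "values repeat themselves until there is a change" suggests, an hourly total may be more useful than an hourly mean. A sketch under that assumption:
# hourly precipitation as the within-hour increase of a cumulative counter
# (an assumption about how the station reports prcp)
useful_pws %>%
  group_by(year, month, day, hour) %>%
  summarise(prcp_total = max(prcp, na.rm = TRUE) - min(prcp, na.rm = TRUE))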
