Match list elements based on attribute component - r

I have a data set that i split into two list int1 and int2.
library(lubridate)
library(tidyverse)
library(purrr)
date <- rep_len(seq(dmy("01-01-2011"), dmy("01-01-2013"), by = "days"), 300)
ID <- rep(c("A","B", "C"), 300)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$month <- month(df$date)
df$year <- year(df$date)
# Create first list
int1 <- df %>%
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(month == "1") %>%
group_split()
# Create second list
int2 <- df %>%
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(month == "2") %>%
group_split()
names(int1) <- sapply(int1, function(x) paste(x$ID[1],
x$year[1], sep = '_'))
names(int2) <- sapply(int2, function(x) paste(x$ID[1],
x$year[1], sep = '_'))
I then assign a attribute to each list (match). I created a function check to grab this attribute more easily. I removed some elements from one list for this exmaple.
int1 <- int1[-c(3,6)]
# Convenience function to grab the attributes for you
check <- function(x) {
return(attr(x, "match"))
}
# Add an attribute to hold the attributes of each list element
attr(int1, "match") <- data.frame(id = sapply(int1, function(x) paste(x$ID[1])),
interval_start_date = sapply(int1, function(x) paste(x$new[1]))
)
# Check the attributes
check(int1)
# Add an attribute "tab" to hold the attributes of each list element
attr(int2, "match") <- data.frame(id = sapply(int2, function(x) paste(x$ID[1])),
interval_start_date = sapply(int2, function(x) paste(x$new[1]))
)
# Check the attributes
check(int2)
I would like to remove elements that are not in another based on the attribute that I had added. Specifically I would like to remove any that don't have the same interval_start_date and ID associated with it. For the interval_start_date, only the year and the day have to match, as the month will most likely differ between the two list. In this case, I would like int2 to match int1. Any thoughts on how I could do this? A base r method is preferred, if possible.
# Expected results
expected_int2 <- list(int2[[1]], int2[[2]], int2[[3]], int2[[4]], int2[[5]],
int2[[6]], int2[[7]])
names(expected_int2) <- sapply(int1, function(x) paste(x$ID[1],
x$year[1], sep = "_"))

We may create an index with %in% after pasteing the 'id' and the formatted 'interval_start_date' i.e. after removing the 'month' part
i1 <- with(check(int2), paste(id, format(as.Date(interval_start_date),
"%Y-%d"))) %in% with(check(int1), paste(id,
format(as.Date(interval_start_date), "%Y-%d")))
> which(i1)
[1] 1 2 4 5 7 8 9
out <- int2[i1]

Related

Function to assign names to multiple list elements

I have a list of data frames int1 and int2. The end goal of this code is to assign the names to the elements in int1 and int2. The rest of the workflow for my work requires me to name the elements of the list multiple times, and I was wondering how I could create a function to reduce the number of keystrokes down the line using base r functions. Any ideas?
library(lubridate)
library(tidyverse)
library(purrr)
date <- rep_len(seq(dmy("01-01-2011"), dmy("31-07-2011"), by = "days"), 200)
ID <- rep(c("A","B", "C"), 200)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$Month <- month(df$date)
# Create first list
int1 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "1") %>%
group_split()
# Create second list
int2 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "2") %>%
group_split()
# Expected Output
# Assign names to int1
names(int1) <- sapply(int1, function(x) paste(x$ID[1],
x$new[1], sep = "_"))
# Assign names to int2
names(int2) <- sapply(int2, function(x) paste(x$ID[1],
x$new[1], sep = "_"))
Using group_split will not name the list elements. It is specified in the ?group_split
it does not name the elements of the list based on the grouping as this typically loses information and is confusing.
Instead use split from base R, which will return with the names pasteed using . from the 'ID', 'new' columns
int1 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "1") %>% ungroup %>%
{split(., .[c('ID', 'new')])}
Similarly for int2

Remove list elements that are not present in another list based on element names

I have two list that I am working with int1 and int2. Both list have similar names for the list elements. I would like to remove specific components in one list, in this case int2 that are not present in another list int1. Is there a good way to do this in base R? I would like my results to look like the expected_int2.
library(lubridate)
library(tidyverse)
library(purrr)
date <- rep_len(seq(dmy("01-01-2011"), dmy("31-07-2011"), by = "days"), 200)
ID <- rep(c("A","B", "C"), 200)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$Month <- month(df$date)
# Create first list
int1 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "1") %>%
group_split()
# Assign names to int1
names(int1) <- sapply(int1, function(x) paste(x$ID[1],
x$new[1], sep = "_"))
#Remove list elements for the example
int1 <- int1[-c(6, 8, 9)]
# Create second list
int2 <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(ID, new) %>%
filter(Month == "2") %>%
group_split()
# Assign names to int2
names(int2) <- sapply(int2, function(x) paste(x$ID[1],
x$new[1], sep = "_"))
# Expected results
expected_int2 <- list(int2[[1]], int2[[2]], int2[[3]], int2[[4]], int2[[5]], int2[[6]])
names(expected_int2) <- sapply(int1, function(x) paste(x$ID[1],
x$new[1], sep = "_"))
We can remove the month part from the names, to check if they are similar to subset
i1 <- sub("(.*)-\\d+-(.*)", "\\1-\\2", names(int2)) %in%
sub("(.*)-\\d+-(.*)", "\\1-\\2", names(int1))
out <- int2[i1]
names(out) <- names(int1)

Filtering uneven data sets

I am trying to filter out a data set into two months. I would like to filter out the ID and year that have data, and to remove the ID and year that do not have an associated pair.
For example if an ID and year has both the January and July month in the data set, I would like to include this ID and the year in my filtered data. If an ID has only the month of January and not July, I would like to remove this data and not include it in the filtered data set. Is there a good way to do this? Just a note that I wasn't sure how to simulate the uneven data set in the example.
After filtering for my desired output, I test by creating a list for each seasonal month where each ID and year has at least 15 rows associated with it.
library(lubridate)
library(dplyr)
set.seed(12345)
df <- tibble(
date = sample(seq(dmy("01-01-2010"), dmy("31-12-2013"), by = "days"),
1000, replace = TRUE),
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID = rep(1:5, 200),
month = month(date),
year =year(date)) %>%
arrange(ID, date)
df %>%
filter(month %in% c(1,7)) %>%
group_by(ID, year) %>%
mutate(complete = length(unique(month)) == 2) %>%
group_by(ID) %>%
filter(all(complete)) %>%
group_by(ID, year)
# Creates a list for each year and by ID
summer_list <- df %>%
filter(month %in% 7) %>%
filter(n() >= 15) %>%
group_split(year, ID)
# Renames the names in the list to AnimalID and year
names(summer_list) <- sapply(summer_list,
function(x) paste(x$ID[1],
x$year[1], sep = '_'))
# Creates a list for each year and by ID
winter_list <- df1 %>%
filter(month %in% 1) %>%
filter(n() >= 15) %>%
group_split(year, ID)
# Renames the names in the list to ID and year
names(winter_list) <- sapply(winter_list,
function(x) paste(x$ID[1],
x$year[1], sep = '_'))
You were really close. I think your filter can be simplified to the following. Just be sure to save it to df.
df <- df %>%
filter(month %in% c(1,7)) %>%
group_by(ID, year) %>%
mutate(complete = length(unique(month)) == 2) %>%
filter(complete)
# could add "%>% select(-c(complete))" to get rid of complete
On summer_list and winter_list, add a group_by between the filters. With the dataset you provided, there were no groups with 15 records, but I tested that this works by bumping up the size of df until I got some.
summer_list <- df %>%
filter(month == 7) %>% # used == since there's only one test value
group_by(ID, year) %>% # added this
filter(n() >= 15) %>%
group_split()
There's also a typo in your first use of winter_list -- the input data is df1, but I think you want df. Hope this works!
Here's the complete code including the larger df:
library(lubridate)
library(dplyr)
set.seed(12345)
df <- tibble(
date = sample(seq(dmy("01-01-2010"), dmy("31-12-2013"), by = "days"),
4000, replace = TRUE),
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID = rep(1:5, 800),
month = month(date),
year =year(date)) %>%
arrange(ID, date)
df <- df %>%
filter(month %in% c(1,7)) %>%
group_by(ID, year) %>%
mutate(complete = length(unique(month)) == 2) %>%
filter(complete)
# could add "%>% select(-c(complete))" to get rid of complete
# Creates a list for each year and by ID
summer_list <- df %>%
filter(month == 7) %>%
group_by(ID, year) %>%
filter(n() >= 15) %>%
group_split()
# Renames the names in the list to AnimalID and year
names(summer_list) <- sapply(summer_list,
function(x) paste(x$ID[1],
x$year[1], sep = '_'))
# Creates a list for each year and by ID
winter_list <- df %>%
filter(month == 1) %>%
group_by(ID, year) %>%
filter(n() >= 15) %>%
group_split()
# Renames the names in the list to ID and year
names(winter_list) <- sapply(winter_list,
function(x) paste(x$ID[1],
x$year[1], sep = '_'))

Creating list with the same number of values

I have a data set with a date, ID, and coordinates that I would like to split into seasonal months. For example for winter I have January to winter1, February to winter2, and March to winter3. I have done the same for the summer months.
I would like to filter out the IDs that have all of these months, so that when I split the data by ID and year, I would have identical list lengths.
I wasn't sure how to simulate uneven values for each ID in the sample code below, but in my actual data some IDs only have summer1 and not winter1, while it could be flipped around for summer2 and winter2`.
library(lubridate)
library(tidyverse)
date <- rep_len(seq(dmy("01-01-2010"), dmy("31-12-2013"), by = "days"),1000)
ID <- rep(seq(1, 5), 100)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$month <- month(df$date)
df$year <- year(df$date)
df1 <- df %>%
mutate(season_categ = case_when(month %in% 6 ~ 'summer1',
month %in% 7 ~ 'summer2',
month %in% 8 ~ 'summer3',
month %in% 1 ~ 'winter1',
month %in% 2 ~ 'winter2',
month %in% 3 ~ 'winter3')) %>%
group_by(year, ID )%>%
filter(any(month %in% 6:8) &
any(month %in% 1:3))
summer_list <- df1 %>%
filter(season_categ == "summer1") %>%
group_split(year, ID)
# Renames the names in the list to AnimalID and year
names(summer_list) <- sapply(summer_list,
function(x) paste(x$ID[1],
x$year[1], sep = '_'))
# Creates a list for each year and by ID
winter_list <- df1 %>%
filter(season_categ == "winter1") %>%
group_split(year, ID)
names(winter_list) <- sapply(winter_list,
function(x) paste(x$ID[1],
x$year[1], sep = '_'))
Not sure if that is what you want, but I understood that you would want to get rid of IDs that have less than the 6 months of Q1 and Q3 in any of the years, but you could modify the filter or grouping if that assumption was wrong.
Here is one approach:
library(lubridate)
library(dplyr)
set.seed(12345)
# random sampling of dates with this seed gives no July date for ID 2 in 2010
df <- tibble(
date = sample(seq(dmy("01-01-2010"), dmy("31-12-2013"), by = "days"),
1000, replace = TRUE),
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID = rep(1:5, 200),
month = month(date),
year =year(date)) %>%
arrange(ID, date)
df %>%
filter(month %in% c(1:3, 6:8)) %>%
group_by(ID, year) %>%
mutate(complete = length(unique(month)) == 6) %>%
group_by(ID) %>%
filter(all(complete)) %>%
group_by(ID, year) %>%
group_split()
To me it is not really clear as to what your are looking for. Before you split the data into a list sort the rows by columns
df1<-df1[order(ID,season_categ),]
### Determine which ID's have uneven numbers ###
df1 %>%
group_by(ID) %>%
summarize(month_seq = paste(season_categ , collapse = "_"),
number_of_months = n(season_categ))
#### Remove odd numbers###

removing groups with a certain NA number

Sorry to bother with a relatively simple question perhaps.
I have this type of dataframe:
A long list of names in the column "NAME" c(a, b, c, d, e ...) , two potential classes in the column "SURNAME" c(A, B) and a third column containing values.
I want to remove all NAMES for which at least in one of the SURNAME classes I have more than 2 "NA" in the VALUE column.
I wanted to post an example dataset but I am struggling to format it properly
I was trying to use
df <- df %>%
group_by(NAME) %>%
group_by(SURNAME) %>%
filter(!is.na(VALUE)) %>%
filter(length(VALUE)>=3)
it does not throw an error but I have the impression that something is wrong. Any suggestion? Many thanks
Let's create a dataset to work with:
set.seed(1234)
df <- data.frame(
name = sample(x=letters, size=1e3, replace=TRUE),
surname = sample(x=c("A", "B"), size=1e3, replace=TRUE),
value = sample(x=c(1:10*10,NA), size=1e3, replace=TRUE),
stringsAsFactors = FALSE
)
Here's how to do it with Base R:
# count NAs by name-surname combos (na.action arg is important!)
agg <- aggregate(value ~ name + surname, data=df, FUN=function(x) sum(is.na(x)), na.action=NULL)
# rename is count of NAs column
names(agg)[3] <- "number_of_na"
#add count of NAs back to original data
df <- merge(df, agg, by=c("name", "surname"))
# subset the original data
result <- df[df$number_of_na < 3, ]
Here's how to do it with data.table:
library(data.table)
dt <- as.data.table(df)
dt[ , number_of_na := sum(is.na(value)), by=.(name, surname)]
result <- dt[number_of_na < 3]
Here's how to do it with dplr/tidyverse:
library(dplyr) # or library(tidyverse)
result <- df %>%
group_by(name, surname) %>%
summarize(number_of_na = sum(is.na(value))) %>%
right_join(df, by=c("name", "surname")) %>%
filter(number_of_na < 3)
After grouping by 'NAME', 'SURNAME', create a column with the number of NA elements in that group and then filter out any 'NAME' that have an 'ind' greater than or equal to 3
df %>%
group_by(NAME, SURNAME) %>%
mutate(ind = sum(is.na(VALUE))) %>%
group_by(NAME) %>%
filter(!any(ind >=3)) %>%
select(-ind)
Or do an anti_join after doing the filtering by 'NAME', 'SURNAME' based on the condition
df %>%
group_by(NAME, SURNAME) %>%
filter(sum(is.na(VALUE))>=3) %>%
ungroup %>%
distinct(NAME) %>%
anti_join(df, .)
data
set.seed(24)
df <- data.frame(NAME = rep(letters[1:5], each = 20),
SURNAME = sample(LETTERS[1:4], 5 * 20, replace = TRUE),
VALUE = sample(c(NA, 1:3), 5 *20, replace = TRUE),
stringsAsFactors = FALSE)

Resources