Return values in successive years - r

First question on here, so hopefully I've done this correctly!
I have a large dataset, the following is a small sample:
id <- c(1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 6)
year <- c("2010", "2011", "2012", "2014", "2012", "2013", "2011", "2012", "2013", "2010", "2011", "2012", "2013", "2008", "2009", "2011")
value <- c(100, 33, 80, 90, 80, 100, 100, 90, 80, 90, 80, 100, 100, 90, 80, 99)
df <- data.frame(id, year, value)
df
For each id I want to return the values of two successive years so that I can compare the value in year n to year n+1. Where there are not two successive years then don't return anything for that id.
The output should be as follows:
id <- c(1, 1, 2, 3, 3, 4, 4, 4, 5)
year <- c("2010", "2011", "2012", "2011", "2012", "2010", "2011", "2012", "2008")
yvalue <- c(100, 33, 80, 100, 90, 90, 80, 100, 90)
yearadd1 <- c("2011", "2012", "2013", "2012", "2013", "2011", "2012", "2013", "2009")
valueadd1 <- c(33, 80, 100, 90, 80, 80, 100, 100, 80)
df <- data.frame(id, year, yvalue, yearadd1, valueadd1)
df
How do I get r to give me this output?
The main difficulty I face is that for id = 1 the first pair of successive years is 2010 and 2011, whereas for id = 5 it is 2008 and 2009, so I can't define what the first year is, as it varies by id.

We can use data.table. Convert the 'data.frame' to a 'data.table' (setDT(df)); then, grouped by 'id', loop through the columns 'year' and 'value', get the lead observation of each using shift, assign (:=) the results to new columns, and remove the rows containing NA (na.omit). Finally, keep only the rows in which 'yearadd1' is the year immediately following 'year'.
library(data.table)
nm1 <- names(df)[2:3]
# Per id, add the next row's year/value as 'yearadd1'/'valueadd1' (lead shift).
# Rows containing NA - including the last row of each id, which has no
# successor - are dropped by na.omit().
dt <- na.omit(setDT(df)[, paste0(nm1, "add1") := lapply(.SD, shift, type = "lead"),
  by = id, .SDcols = nm1])
# Keep a pair only when its own two years are consecutive. Comparing each row's
# 'yearadd1' with its 'year' also rejects a gap in the first pair of an id
# (e.g. 2010 -> 2012), which looking at neighbouring 'yearadd1' values alone
# would miss. as.character() makes this work for factor and character years.
dt[as.numeric(as.character(yearadd1)) - as.numeric(as.character(year)) == 1]
# id year value yearadd1 valueadd1
#1: 1 2010 100 2011 33
#2: 1 2011 33 2012 80
#3: 2 2012 80 2013 100
#4: 3 2011 100 2012 90
#5: 3 2012 90 2013 80
#6: 4 2010 90 2011 80
#7: 4 2011 80 2012 100
#8: 4 2012 100 2013 100
#9: 5 2008 90 2009 80

Related

R dplyr perform different aggregation by group

I have a dataframe dat which looks like this:
dat <- structure(list(cell.ID = c(329574L, 329574L, 329574L, 329574L,
329574L, 329574L, 329574L, 329574L, 329574L, 329574L, 329574L,
329574L), Year = c("2010", "2010", "2010", "2010", "2010", "2010",
"2010", "2010", "2010", "2010", "2010", "2010"), month_name = c("June",
"July", "June", "July", "June", "July", "June", "July", "June",
"July", "June", "July"), value = c(459.860986624053, 398.94083733151,
16, 23, 111.69, 453.333, 71.55, 30.38, 31.928, 30.13355, 17.587,
19.7938709677419), variable_name = c("ETo", "ETo", "Rday", "Rday",
"Rsum", "Rsum", "Thdd", "Thdd", "Tmax", "Tmax", "Tmin", "Tmin"
), monthID = c(6L, 7L, 6L, 7L, 6L, 7L, 6L, 7L, 6L, 7L, 6L, 7L
)), row.names = c(NA, -12L), class = "data.frame")
library(dplyr)
dat %>%
dplyr::group_by(Year, variable_name) %>%
dplyr::summarise(variable = sum(value))
If I want to average the Tmax and Tmin and sum the rest of the variables, I did this
dat %>%
dplyr::group_by(Year, variable_name) %>%
dplyr::summarise(variable = ifelse(variable_name %in% c('Tmax', 'Tmin'), mean(value), sum(value)))
Error: Column `variable` must be length 1 (a summary value), not 2
How do I correct this?
Another way to do this in dplyr is to use if and else instead of ifelse:
# variable_name is constant within each (Year, variable_name) group, so its
# first element identifies the group; if/else then yields the single value
# that summarise() expects.
dat %>%
  group_by(Year, variable_name) %>%
  summarise(
    variable = if (first(variable_name) %in% c('Tmax', 'Tmin')) {
      mean(value)
    } else {
      sum(value)
    }
  )
# A tibble: 6 x 3
# Groups: Year [1]
Year variable_name variable
<chr> <chr> <dbl>
1 2010 ETo 859.
2 2010 Rday 39
3 2010 Rsum 565.
4 2010 Thdd 102.
5 2010 Tmax 31.0
6 2010 Tmin 18.7
I think the problem is that ifelse in this context is operating row-wise, not at the level of the group. If that's right, then you could work around the problem by getting both summary statistics and then conditionally selecting the one you want by variable name, like this:
# Compute both aggregates per group, then pick the mean for the temperature
# variables and the sum for everything else, dropping the helper columns.
dat %>%
  dplyr::group_by(Year, variable_name) %>%
  dplyr::summarise(var_sum = sum(value), var_mean = mean(value)) %>%
  dplyr::mutate(
    use_mean = variable_name %in% c('Tmax', 'Tmin'),
    variable = ifelse(use_mean, var_mean, var_sum)
  ) %>%
  dplyr::select(-var_sum, -var_mean, -use_mean)
Result:
# A tibble: 6 x 3
# Groups: Year [1]
Year variable_name variable
<chr> <chr> <dbl>
1 2010 ETo 859.
2 2010 Rday 39
3 2010 Rsum 565.
4 2010 Thdd 102.
5 2010 Tmax 31.0
6 2010 Tmin 18.7

Mutate and case when issue - dplyr

I have the following data and I want to make a new column using mutate which details when colour = 'g' then take the level on the g row minus the level figure on the 'r' row.
Then likewise with type. Where type = 1 then take the corresponding level minus the level on the type 2 row.
library(dplyr)
d <- tibble(
date = c("2018", "2018", "2018", "2019", "2019", "2019", "2020", "2020", "2020", "2020"),
colour = c("none","g", "r", "none","g", "r", "none", "none", "none", "none"),
type = c("type1", "none", "none", "type2", "none", "none", "none", "none", "none", "none"),
level= c(78, 99, 45, 67, 87, 78, 89, 87, 67, 76))
Just to be clear this is what I want the data to look like.
So the data should look like this:
d2 <- tibble(
date = c("2018", "2018", "2018", "2019", "2019", "2019", "2020", "2020", "2020", "2020"),
colour = c("none","g", "r", "none","g", "r", "none", "none", "none", "none"),
type = c("type1", "none", "none", "type2", "none", "none", "none", "none", "none", "none"),
level= c(78, 99, 45, 67, 87, 78, 89, 87, 67, 76),
color_gap = c("NULL", 44, "NULL", "NULL", 9, "NULL", "NULL", "NULL", "NULL", "NULL"),
type_gap = c(11, "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL"))
I started to use mutate and case when and got to the below. However, I'm stuck on the final calculation part. How do I say I want to take the color g level - the color r level?
d %>%
mutate(color_gap = case_when(color == "g" ~ level)%>%
mutate(type_gap = case_when(type== "type1" ~ level)%>%
) -> d2
Anyone know how to complete this?
Thanks
This subtracts the first r level from the first g level, second r level from second g level, etc. Same for type1 and type2. This has no checks at all. It doesn't check whether there is a matching r for each g, whether they are in the expected order, whether they are in the same date-group, etc. It assumes the data is already perfectly formatted as expected, so be careful using this on real data.
d %>%
  mutate(
    # Start from an all-NA column and fill only the 'g' rows with g - r,
    # pairing the k-th 'g' level with the k-th 'r' level.
    color_gap = {
      gap <- rep(NA, n())
      gap[colour == 'g'] <- level[colour == 'g'] - level[colour == 'r']
      gap
    },
    # Same positional pairing for 'type1' minus 'type2'.
    type_gap = {
      gap <- rep(NA, n())
      gap[type == 'type1'] <- level[type == 'type1'] - level[type == 'type2']
      gap
    }
  )
# # A tibble: 10 x 6
# date colour type level color_gap type_gap
# <chr> <chr> <chr> <dbl> <dbl> <dbl>
# 1 2018 none type1 78 NA 11
# 2 2018 g none 99 54 NA
# 3 2018 r none 45 NA NA
# 4 2019 none type2 67 NA NA
# 5 2019 g none 87 9 NA
# 6 2019 r none 78 NA NA
# 7 2020 none none 89 NA NA
# 8 2020 none none 87 NA NA
# 9 2020 none none 67 NA NA
# 10 2020 none none 76 NA NA
you could do this with group_by and mutate.
I assumed that there is only 1 row per date that would satisfy each condition.
d %>%
  # Colour gap: within each date, the 'g' level minus the 'r' level, written on
  # the 'g' row. Dates without exactly one 'g' and one 'r' row get NA instead
  # of the -Inf/NaN that max() over an all-NA group would produce.
  group_by(date) %>%
  mutate(
    color_gap = if (sum(colour == "g", na.rm = TRUE) == 1 &&
      sum(colour == "r", na.rm = TRUE) == 1) {
      ifelse(
        colour == "g",
        level[which(colour == "g")] - level[which(colour == "r")],
        NA_real_
      )
    } else {
      NA_real_
    }
  ) %>%
  ungroup() %>%
  # Type gap: in the sample data 'type1' and 'type2' sit on different dates, so
  # this is computed over the whole table, on the 'type1' row.
  mutate(
    type_gap = if (sum(type == "type1", na.rm = TRUE) == 1 &&
      sum(type == "type2", na.rm = TRUE) == 1) {
      ifelse(
        type == "type1",
        level[which(type == "type1")] - level[which(type == "type2")],
        NA_real_
      )
    } else {
      NA_real_
    }
  )

How to make rownames first column and replace a given cell with a value in a list of data frames?

I have a list of roughly 200 datasets where each data set looks like:
Year 2010 2011 2012 2013 2014
Womenpct 0.6 0.8 0.7 0.6 0.7
Menpct 0.4 0.2 0.3 0.4 0.3
What I want to achieve is to first replace the rownames in each dataset to
Year 2010 2011 2012 2013 2014
Women 0.6 0.8 0.7 0.6 0.7
Men 0.4 0.2 0.3 0.4 0.3
Data:
df <- list(`1` = structure(c("2010", "0.5388350", "0.4611650", "2011",
"0.5360517", "0.4639483", "2012", "0.5460852", "0.4539148", "2013",
"0.5401961", "0.4598039", "2014", "0.5475490", "0.4524510"), .Dim = c(3L,
5L), .Dimnames = list(c("Year", "Womenpct", "Menpct"), NULL)),
`2` = structure(c("2010", "0.5388350", "0.4611650", "2011",
"0.5360517", "0.4639483", "2012", "0.5460852", "0.4539148",
"2013", "0.5401961", "0.4598039", "2014", "0.5475490", "0.4524510"
), .Dim = c(3L, 5L), .Dimnames = list(c("Year", "Womenpct",
"Menpct"), NULL)))
After this I'm trying to make the rownames the first column, and want all datasets have the following structure
Year 2010 2011 2012 2013 2014
Women Women 0.6 0.8 0.7 0.6 0.7
Men Men 0.4 0.2 0.3 0.4 0.3
For the rownames I've tried using lapply to change them
df <- lapply(df, function(x) rownames(x)[1] <- "Women")
But this only returns a single character in each data frame.
To insert a column I've also tried to use lapply
lapply(df, function(x) add_column(x, Gender = "", .before = 1))
Which seems to convert my data frames into new lists instead.
Is there some way I can get R to apply the desired functions for all data frames in my list?
Any help would be greatly appreciated.
Based on the dput it looks like you have list of matrices with column names as first row. We can do
lapply(df, function(x) {
  # Convert matrix to data frame (keeps the matrix row names)
  temp <- as.data.frame(x, stringsAsFactors = FALSE)
  # Remove the "pct" suffix from the row names
  rownames(temp) <- sub("pct$", "", rownames(temp))
  # Add the row names as a new column and move it to the front, as requested
  temp$Year <- rownames(temp)
  temp <- temp[c(ncol(temp), seq_len(ncol(temp) - 1L))]
  # Use the first row ("Year", "2010", ...) as column names; unlist() turns the
  # one-row data frame into a plain character vector
  names(temp) <- unlist(temp[1, ], use.names = FALSE)
  # Remove the first row, which now only duplicates the header
  temp[-1, ]
})
#$`1`
# Year 2010 2011 2012 2013 2014
#Women Women 0.5388350 0.5360517 0.5460852 0.5401961 0.5475490
#Men Men 0.4611650 0.4639483 0.4539148 0.4598039 0.4524510
#$`2`
# Year 2010 2011 2012 2013 2014
#Women Women 0.5388350 0.5360517 0.5460852 0.5401961 0.5475490
#Men Men 0.4611650 0.4639483 0.4539148 0.4598039 0.4524510
data
df <- list(`1` = structure(c("2010", "0.5388350", "0.4611650", "2011",
"0.5360517", "0.4639483", "2012", "0.5460852", "0.4539148", "2013",
"0.5401961", "0.4598039", "2014", "0.5475490", "0.4524510"), .Dim = c(3L,
5L), .Dimnames = list(c("Year", "Womenpct", "Menpct"), NULL)),
`2` = structure(c("2010", "0.5388350", "0.4611650", "2011",
"0.5360517", "0.4639483", "2012", "0.5460852", "0.4539148",
"2013", "0.5401961", "0.4598039", "2014", "0.5475490", "0.4524510"
), .Dim = c(3L, 5L), .Dimnames = list(c("Year", "Womenpct",
"Menpct"), NULL)))
Forgoing apply functions, this should do what you want:
# Update every matrix of the list `df` in place. A separate temporary (`tab`)
# is used so the list itself is not overwritten inside the loop.
for (i in seq_along(df)) {
  tab <- df[[i]]
  # Rename every row except the first ("Year") header row
  rownames(tab)[-1] <- c("Women", "Men")
  # Prepend the labels as a first column; the header row gets an empty label
  tab <- cbind(c("", rownames(tab)[-1]), tab)
  df[[i]] <- tab
}

Identifying the column that has the lowest value

I have a data frame given by the following
DF <- structure(list(ID = c(1, 129, 169, 1087), `Collab Years Patents` = c(NA,
"2011, 2011, 2011", "2010", "2006, 2006"), `Collab Years Publications` = c("2011",
"2015, 2016, 2016", "2010", NA), ECP = c("2011", "2011", "2010",
"2006")), .Names = c("ID", "Collab Years Patents", "Collab Years Publications",
"ECP"), row.names = c(1L, 107L, 136L, 859L), class = "data.frame")
The column ECP is the minimum year of the two collaboration columns (which could contain several years). I need an output that says which column the ECP belongs to. For example, a solution to above could be another column vector to above frame with the elements:
structure(list(ID = c(1, 129, 169, 1087), `Collab Years Patents` = c(NA,
"2011, 2011, 2011", "2010", "2006, 2006"), `Collab Years Publications` = c("2011",
"2015, 2016, 2016", "2010", NA), ECP = c("2011", "2011", "2010",
"2006"), identifier = c("Publications", "Patents", "Both", "Patents"
)), .Names = c("ID", "Collab Years Patents", "Collab Years Publications",
"ECP", "identifier"), row.names = c(1L, 107L, 136L, 859L), class = "data.frame")
Here is an option using str_detect. Loop through the collaboration columns (sapply(DF[2:3], ...)) and use str_detect to check which of the columns contain the value of 'ECP'. Multiply by col to convert the TRUE values to the column index, and replace the NA elements with 0. Then get the column names corresponding to the maximum column index, remove the prefix part of the column names with sub, and, for the rows where both columns of 'm1' are greater than 0 (i.e. 'ECP' occurs in both), set the corresponding elements of the created vector 'v1' to 'Both'.
library(stringr)
# Per collaboration column: TRUE where the row's ECP year occurs in that cell;
# multiplying by col() turns each TRUE into its column index (1 or 2).
m1 <- col(DF[2:3]) * sapply(DF[2:3], function(x) str_detect(x, DF$ECP))
# NA cells (missing collaboration years) count as "no match".
m1[is.na(m1)] <- 0
# max.col() breaks ties at random by default; "first" keeps the lookup
# deterministic. Rows with no match at all are set to NA below.
v1 <- sub(".*\\s(\\w+)$", "\\1", names(DF)[2:3][max.col(m1, ties.method = "first")])
v1[rowSums(m1 > 0) == 2] <- "Both"
v1[rowSums(m1 > 0) == 0] <- NA
DF$identifier <- v1
DF$identifier
#[1] "Publications" "Patents" "Both" "Patents"
Using tidyverse (dplyr and purrr):
library(tidyverse)
DF %>%
  # Split the comma-separated year strings into vectors of years
  mutate_at(2:3, strsplit, ", ") %>%
  # .x = patent years, .y = publication years, ..3 = ECP.
  # Index: 1 = no match (NA), 2 = publications only, 3 = patents only, 4 = both.
  # pmap_chr() returns a character column rather than a list column.
  transmute(identifier = pmap_chr(.[2:4], ~ c(NA, "Publications", "Patents", "Both")[
    1 + 2 * (..3 %in% .x) + (..3 %in% .y)])) %>%
  bind_cols(DF, .)
# ID Collab Years Patents Collab Years Publications ECP identifier
# 1 1 <NA> 2011 2011 Publications
# 2 129 2011, 2011, 2011 2015, 2016, 2016 2011 Patents
# 3 169 2010 2010 2010 Both
# 4 1087 2006, 2006 <NA> 2006 Patents

convert year-doy to mdy

I would like to convert a column (or create a new one) which is year-day of year to m/d/y. Originally I had year and day-of-year as two separate columns, but I concatenated (paste) them together because I thought I would need the year included with the day of year because of leap years. I am not opposed to using an additional package such as date.
Here is my data:
dat <- structure(list(doy = c(320, 350, 309, 310, 328, 321, 301, 338,
304, 304, 308), year = structure(1:11, .Label = c("2000", "2001",
"2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009",
"2010"), class = "factor"), conc = c("2000-320", "2001-350",
"2002-309", "2003-310", "2004-328", "2005-321", "2006-301", "2007-338",
"2008-304", "2009-304", "2010-308")), row.names = c(NA, -11L), class = "data.frame", .Names = c("doy",
"year", "conc"))
And looks like:
doy year conc
1 320 2000 2000-320
2 350 2001 2001-350
3 309 2002 2002-309
4 310 2003 2003-310
5 328 2004 2004-328
6 321 2005 2005-321
7 301 2006 2006-301
8 338 2007 2007-338
9 304 2008 2008-304
10 304 2009 2009-304
11 308 2010 2010-308
-cherrytree
No additional packages necessary:
# Parse the "year-dayofyear" strings in conc (%j = day of year) into a new
# dtime column appended to dat.
transform(dat, dtime = as.POSIXct(conc, format = '%Y-%j'))
Something like this works.
# Build January 1st of each year as a Date, then offset it by the day of year.
as.Date(paste0(as.character(dat$year), "-01-01")) + dat$doy - 1
Just adds the day of the year (minus one) to Jan 1 of the year.

Resources