Select unique entries showing at least one value from another column - r

I have the following dataset (32000 entries) of water chemical compounds annual means organized by monitoring sites and sampling year:
data= data.frame(Site_ID=c(1, 1, 1, 2, 2, 2, 3, 3, 3), Year=c(1976, 1977, 1978, 2004, 2005, 2006, 2003, 2004, 2005), AnnualMean=c(1.1, 1.2, 1.1, 2.1, 2.6, 3.1, 2.7, 2.6, 1.9))
Site_ID Year AnnualMean
1 1976 1.1
1 1977 1.2
1 1978 1.1
2 2004 2.1
2 2005 2.6
2 2006 3.1
3 2003 2.7
3 2004 2.6
3 2005 1.9
I would like to keep only the data from monitoring sites that have at least one measurement in 2005 within their time range. With the above dataset, the expected output dataset would be:
Site_ID Year AnnualMean
2 2004 2.1
2 2005 2.6
2 2006 3.1
3 2003 2.7
3 2004 2.6
3 2005 1.9
I am completely new to R and have been struggling with data manipulation, so thank you in advance!

With dplyr:
library(dplyr)
# Keep every row of a site as soon as one of that site's years equals 2005.
data %>%
  group_by(Site_ID) %>%
  filter(any(Year == 2005))

Here is a base R solution, using subset + ave
# ave() runs the membership test once per Site_ID and recycles the answer to
# every row of that site. Because Year is numeric, ave() hands back 0/1, so
# as.logical() turns it into the row filter subset() expects.
# `data` is the data frame built in the question; Year is compared as a number.
dfout <- subset(
  data,
  as.logical(ave(Year, Site_ID, FUN = function(x) 2005 %in% x))
)
such that
> dfout
Site_ID Year AnnualMean
4 2 2004 2.1
5 2 2005 2.6
6 2 2006 3.1
7 3 2003 2.7
8 3 2004 2.6
9 3 2005 1.9

An option with data.table
library(data.table)
# Within each Site_ID group, `2005 %in% Year` is a single TRUE/FALSE; indexing
# .SD with it keeps all of the group's rows when TRUE and none when FALSE.
setDT(data)[, .SD[2005 %in% Year], Site_ID]

Related

Calculating the change in % of data by year

I am trying to calculate the % change by year in the following dataset, does anyone know if this is possible?
I have the difference but am unsure how we can change this into a percentage
diff(economy_df_by_year$gdp_per_capita)
df
year gdp
1998 8142.
1999 8248.
2000 8211.
2001 7926.
2002 8366.
2003 10122.
2004 11493.
2005 12443.
2006 13275.
2007 15284.
Assuming that gdp is the total value, you could do something like this:
library(tidyverse)
df <- tribble(
  ~year, ~gdp,
  1998, 8142,
  1999, 8248,
  2000, 8211,
  2001, 7926,
  2002, 8366,
  2003, 10122,
  2004, 11493,
  2005, 12443,
  2006, 13275,
  2007, 15284
)
# Percent change is measured against the previous year's value, so the
# denominator is lag(gdp) rather than the current year's gdp. The first year
# has no predecessor and therefore yields NA.
df |>
  mutate(pdiff = 100 * (gdp - lag(gdp)) / lag(gdp))
#> # A tibble: 10 × 3
#> year gdp pdiff
#> <dbl> <dbl> <dbl>
#> 1 1998 8142 NA
#> 2 1999 8248 1.30
#> 3 2000 8211 -0.449
#> 4 2001 7926 -3.47
#> 5 2002 8366 5.55
#> 6 2003 10122 21.0
#> 7 2004 11493 13.5
#> 8 2005 12443 8.27
#> 9 2006 13275 6.69
#> 10 2007 15284 15.1
Which relies on the tidyverse framework.
If gdp is the difference, you will need the total to get a percentage, if that is what you mean by change in percentage by year.
# Year-over-year growth as a fraction of the previous year's gdp (multiply by
# 100 for a percentage). The first row has no previous year and stays NA.
# diff()/head() avoid hard-coding the number of rows and, unlike
# df[rows, "gdp"], also work when df is a tibble.
df$change <- c(NA_real_, diff(df$gdp) / head(df$gdp, -1))
This assigns the yearly GDP growth to each row except the first one where it remains as NA
# Year-over-year difference; the first year has no predecessor, so it gets 0.
df$diff <- c(0, diff(df$gdp))
# gdp minus its own difference is the previous year's gdp, which is the base
# of the percentage.
df$percentDiff <- 100 * (df$diff / (df$gdp - df$diff))
This is another possibility.

Find average change in timeseries

I have an annual mean timeseries dataset for 15 years, and I am trying to find the average change/increase/decrease in this timeseries.
The timeseries I have is spatial (average values for each grid-cell/pixel, years repeat).
How can I do this in R via dplyr?
Sample data
year = c(2005, 2005, 2005, 2005, 2006, 2006, 2006, 2006, 2007, 2007, 2007, 2007, 2008, 2008, 2008, 2008)
Tmean = c(24, 24.5, 25.8,25, 24.8, 25, 23.5, 23.8, 24.8, 25, 25.2, 25.8, 25.3, 25.6, 25.2, 25)
Code
library(tidyverse)
df = data.frame(year, Tmean)
change = df$year %>%
# Sort by year
arrange(year) %>%
mutate(Diff_change = Tmean - lag(Tmean), # Difference in Tmean between years
Rate_percent = (Diff_change / year)/Tmean * 100) # Percent change # **returns inf values**
Average_change = mean(change$Rate_percent, na.rm = TRUE)
To find the average: mean(). To find the differences or changes: diff()
So, to find the average change:
> avg_change <- mean(diff(Tmean))
> print(avg_change)
[1] 0.06666667
If you need that in percentage, then you want to find out how much the difference between an element and its previous one (this year - last year) is in percentage with respect to last year, like so:
> pct_change <- Tmean[2:length(Tmean)] / Tmean[1:(length(Tmean)-1)] - 1
> avg_pct_change <- mean(pct_change) * 100
> print(avg_pct_change)
[1] 0.3101632
We can put those vectors into a data frame to use with dplyr (...if that's how you want to do it; this is straightforward with base R as well).
library(dplyr)
df <- data.frame(year, Tmean)
change <- df %>%
  arrange(year) %>%
  mutate(
    # Difference in Tmean between consecutive rows
    Diff_change = Tmean - lag(Tmean),
    # Percent change relative to the previous row's Tmean. The change is not
    # divided by `year - lag(year)`: the sample years repeat, so that
    # difference is 0 within a year and the division would give Inf/NaN.
    Rate_percent = Diff_change / lag(Tmean) * 100
  )
# The first row has no previous value, so its NA is dropped from the mean.
Average_change <- mean(change$Rate_percent, na.rm = TRUE)
Results (with updated question data)
> change
year Tmean Diff_change Rate_percent
1 2005 24.0 NA NA
2 2005 24.5 0.5 2.0833333
3 2005 25.8 1.3 5.3061224
4 2005 25.0 -0.8 -3.1007752
5 2006 24.8 -0.2 -0.8000000
6 2006 25.0 0.2 0.8064516
7 2006 23.5 -1.5 -6.0000000
8 2006 23.8 0.3 1.2765957
9 2007 24.8 1.0 4.2016807
10 2007 25.0 0.2 0.8064516
11 2007 25.2 0.2 0.8000000
12 2007 25.8 0.6 2.3809524
13 2008 25.3 -0.5 -1.9379845
14 2008 25.6 0.3 1.1857708
15 2008 25.2 -0.4 -1.5625000
16 2008 25.0 -0.2 -0.7936508
> Average_change
[1] 0.3101632

How to count how many values were used in a mean() function?

I am trying to create a column in a data frame containing how many values were used in the mean function for each line.
First, I had a data frame df like this:
df <- data.frame(tree_id=rep(c("CHC01", "CHC02"),each=8),
rad=(c(rep("A", 4),rep("B", 4), rep("A", 4),
rep("C", 4))), year=rep(2015:2018, 4),
growth= c(NA, NA, 1.2, 3.2, 2.1, 1.5, 2.3, 2.7, NA, NA, NA, 1.7, 3.5, 1.4, 2.3, 2.7))
Then, I created a new data frame called avg_df, containing only the mean values of growth grouped by tree_id and year
library(dplyr)
avg_df <- df%>%
group_by(tree_id, year, add=TRUE)%>%
summarise(avg_growth=mean(growth, na.rm = TRUE))
Now, I would like to add a new column to avg_df containing how many values were used to calculate the mean growth for each tree_id and year, ignoring the NAs.
Example: for CHC01 in 2015, the result is 1, because it was the average of 2.1 and NA and
for CHC01 in 2018, it will be 2, because the result is the average of 3.2 and 2.7
Here is the expected output:
avg_df$radii <- c(1,1,2,2,1,1,1,2)
tree_id year avg_growth radii
CHC01 2015 2.1 1
CHC01 2016 1.5 1
CHC01 2017 1.75 2
CHC01 2018 2.95 2
CHC02 2015 3.5 1
CHC02 2016 1.4 1
CHC02 2017 2.3 1
CHC02 2018 2.2 2
*In my real data, the values in radii will vary from 1 to 4.
Could anyone help me with this?
Thank you very much!
We can get the sum of non-NA elements (!is.na(growth)) after grouping by 'tree_id' and 'year'
library(dplyr)
# One row per tree and year: the mean of the non-missing growth values and the
# number of values that went into that mean.
summarise(
  group_by(df, tree_id, year),
  avg_growth = mean(growth, na.rm = TRUE),
  radii = sum(!is.na(growth))
)
# A tibble: 8 x 4
# Groups: tree_id [2]
# tree_id year avg_growth radii
# <fct> <int> <dbl> <int>
#1 CHC01 2015 2.1 1
#2 CHC01 2016 1.5 1
#3 CHC01 2017 1.75 2
#4 CHC01 2018 2.95 2
#5 CHC02 2015 3.5 1
#6 CHC02 2016 1.4 1
#7 CHC02 2017 2.3 1
#8 CHC02 2018 2.2 2
Or using data.table
library(data.table)
# Same aggregation with data.table: mean of the non-missing growth values and
# how many there were, per tree_id and year.
setDT(df)[
  ,
  list(
    avg_growth = mean(growth, na.rm = TRUE),
    radii = sum(!is.na(growth))
  ),
  by = c("tree_id", "year")
]

Conversion of monthly data to yearly data in a dataframe in r

I have a dataframe showing monthly mgpp from 2000-2010:
dataframe1
Year Month mgpp
1: 2000 1 0.01986404
2: 2000 2 0.011178429
3: 2000 3 0.02662008
4: 2000 4 0.05034293
5: 2000 5 0.23491388
---
128: 2010 8 0.13234501
129: 2010 9 0.10432369
130: 2010 10 0.04329537
131: 2010 11 0.04343289
132: 2010 12 0.09494946
I am trying to convert this dataframe1 into a raster that will show the variable mgpp. However I want to format the dataframe first which will show only the yearly mgpp. The expected outcome is shown below :
dataframe1
Year mgpp
1: 2000 0.01986704
2: 2001 0.01578429
3: 2002 0.02662328
4: 2003 0.05089593
5: 2004 0.07491388
6: 2005 0.11229201
7: 2006 0.10318569
8: 2007 0.07129537
9: 2008 0.04373689
10: 2009 0.02885386
11: 2010 0.74848348
I want to aggregate the months by mean. For instance, the value for 2000 should be a single value: the mean over Jan-Dec of the year 2000. How can I achieve this? Help would be appreciated.
Here is a data.table approach.
library(data.table)
# Average the monthly mgpp values within each Year.
setDT(dataframe1)[, list(Yearly.mgpp = mean(mgpp)), by = "Year"]
Year Yearly.mgpp
1: 2000 0.06858387
2: 2010 0.08366928
Or if you prefer dplyr.
library(dplyr)
# Average the monthly mgpp values within each Year.
summarise(group_by(dataframe1, Year), Yearly.mgpp = mean(mgpp))
# A tibble: 2 x 2
Year Yearly.mgpp
<dbl> <dbl>
1 2000 0.0686
2 2010 0.0837
Or base R.
# Split mgpp by Year and average each piece. vapply() (rather than sapply())
# guarantees a named numeric vector even when there are no rows, so the
# data.frame() call always receives a numeric column.
result <- vapply(split(dataframe1$mgpp, dataframe1$Year), mean, numeric(1))
# The names of `result` are the years (as strings); convert them back to numbers.
data.frame(Year = as.numeric(names(result)), Yearly.mgpp = result)
Year Yearly.mgpp
2000 2000 0.06858387
2010 2010 0.08366928
Sample Data
# Sample data: five months of 2000 followed by five months of 2010.
dataframe1 <- data.frame(
  Year = rep(c(2000, 2010), each = 5),
  Month = c(1, 2, 3, 4, 5, 8, 9, 10, 11, 12),
  mgpp = c(
    0.01986404, 0.011178429, 0.02662008, 0.05034293, 0.23491388,
    0.13234501, 0.10432369, 0.04329537, 0.04343289, 0.09494946
  )
)

Matching DFs on two columns and multiplying

I have a dataframe such as the following one, only with many more columns and an additional ID variable.
data <- data.frame(year = c(rep(2014,12), rep(2015,12)), month = c(seq(1,12), seq(1,12)), value = c(rep(5,24)))
The data for some year/month combinations is incorrect, and must be adjusted by multiplying by a factor for the periods shown below.
fix <- data.frame(year = c(2014, 2014, 2015), month = c(1, 5, 6), f = c(.9, 1.1, 12))
I'm currently doing this via ddply, but I'm looking for a more elegant solution:
factorize <- function(x) {
x$value = x$value * fix[fix$year == unique(x$year) & fix$month == unique(x$month),3]
x
}
data2 <- ddply(data, c("year", "month"), factorize)
Any thoughts or suggestions?
Thanks!
Here's a base R approach:
# Left-join the factors onto data (f is NA where no fix applies), scale value
# only where a factor was found, then drop the helper column f.
within(merge(data, fix, all.x = TRUE), {
  value <- ifelse(is.na(f), value, value * f)
  rm(f)
})
And in case you need faster performance you can use data.table:
library(data.table)
# Left join on year/month: every row of data is kept and gets the factor f
# where the pair appears in fix (f is NA for all other rows).
data <- merge(setDT(data), setDT(fix), all.x = TRUE, by = c("year", "month"))
# Scale value only on the rows that received a factor.
data[!is.na(f), value := value*f]
# Drop the helper column again.
data[,f := NULL]
I think that with one line of code with dplyr and ifelse you can achieve your goal.
# Match the correction factors on year AND month with a join. Comparing
# `year == fix$year` directly would recycle the 3-row fix table down the 24
# rows of data and only match by coincidence of row position.
# Rows without a factor keep their value; the others are multiplied by f.
data %>%
  left_join(fix, by = c("year", "month")) %>%
  mutate(value = ifelse(is.na(f), value, value * f)) %>%
  select(-f)
#> year month value
#> 1 2014 1 4.5
#> 2 2014 2 5.0
#> 3 2014 3 5.0
#> 4 2014 4 5.0
#> 5 2014 5 5.5
#> 6 2014 6 5.0
#> 7 2014 7 5.0
#> 8 2014 8 5.0
#> 9 2014 9 5.0
#> 10 2014 10 5.0
#> 11 2014 11 5.0
#> 12 2014 12 5.0
#> 13 2015 1 5.0
#> 14 2015 2 5.0
#> 15 2015 3 5.0
#> 16 2015 4 5.0
#> 17 2015 5 5.0
#> 18 2015 6 60.0
#> 19 2015 7 5.0
#> 20 2015 8 5.0
#> 21 2015 9 5.0
#> 22 2015 10 5.0
#> 23 2015 11 5.0
#> 24 2015 12 5.0

Resources