I have data on each avalanche that occurred. I need to calculate the number of avalanches per year and month, but the data only gives the exact day on which each avalanche occurred. How do I count the number of occurrences in each year-month? I also only need the winter year-months (December (12) through March (3)). Please help!
library(XML)
library(RCurl)
library(dplyr)
# Scrape every page of the Utah Avalanche Center observations table and
# stack the pages into a single data frame.
avalanche.url <- "https://utahavalanchecenter.org/observations?page="
all.pages <- 0:202

# Download and parse one results page. Returns the first HTML table on the
# page as a data frame with the four column names assigned below.
read_avalanche_page <- function(page) {
  this.url <- paste0(avalanche.url, page)
  this.webpage <- htmlParse(getURL(this.url))
  thispage.avalanche <- readHTMLTable(
    this.webpage,
    which = 1,
    header = TRUE,
    stringsAsFactors = FALSE
  )
  names(thispage.avalanche) <- c("Date", "Region", "Location", "Observer")
  thispage.avalanche
}

# Collect all pages in a list and bind them once. Growing a data frame with
# rbind() inside the loop re-copies every accumulated row on each iteration.
avalanche <- do.call(rbind, lapply(all.pages, read_avalanche_page))

# subset the data to the Salt Lake Region
avalancheslc <- subset(avalanche, Region == "Salt Lake")
str(avalancheslc)
The output should look something like:
Date AvalancheTotal
2000-01 1
2000-02 2
2000-03 8
2000-12 23
2001-01 16
.
.
.
.
.
2019-03 45
Using dplyr, you could get the variable of interest ("year-month") from the Date column, group by this variable, and then compute the number of rows in each group.
In a similar way, you can filter to only get the months you like:
library(dplyr)

# Month numbers treated as winter: January-March plus December.
winter_months <- c(1:3, 12)

# Parse the date strings, derive a "YYYY-MM" label and a numeric month,
# keep winter rows only, then count the rows under each label.
avalancheslc %>%
  mutate(
    Date = as.Date(Date, "%m/%d/%Y"),
    YearMonth = format(Date, "%Y-%m"),
    Month = as.numeric(format(Date, "%m"))
  ) %>%
  filter(Month %in% winter_months) %>%
  group_by(YearMonth) %>%
  summarise(AvalancheTotal = n())
We can convert to yearmon from zoo and use that in the group_by to get the number of rows
library(dplyr)
library(zoo)
dim(avalancheslc)
#[1] 5494 4

# Replace each date string with its "YYYY-MM" label (via zoo's yearmon),
# then count the rows that share a label.
out <- avalancheslc %>%
  mutate(Date = format(as.yearmon(Date, "%m/%d/%Y"), "%Y-%m")) %>%
  group_by(Date) %>%
  summarise(AvalancheTotal = n())
If we need only output from December to March, then filter the data
# Characters 6-7 of the "YYYY-MM" label are the month; keep December-March.
subOut <- filter(out, as.integer(substr(Date, 6, 7)) %in% c(12, 1:3))
Or it can be filtered earlier in the chain
library(lubridate)
# Same count as before, but the winter filter is applied before counting.
out <- avalancheslc %>%
# turn the "%m/%d/%Y" date strings into zoo year-month values
mutate(Date = as.yearmon(Date, "%m/%d/%Y")) %>%
# keep only rows whose month number is 12, 1, 2 or 3
filter(month(Date) %in% c(12, 1:3)) %>%
# one row per year-month; count() stores the row count in a column named `n`
count(Date)
dim(out)
#[1] 67 2
Now, for filling with 0's
# Build every winter year-month between the first and last observed year,
# then left-join the observed counts so that months with no avalanches
# appear with a count of 0.
mths <- month.abb[c(12, 1:3)]

# crossing(), unite() and replace_na() live in tidyr, which this snippet never
# attaches; they are namespace-qualified so the code runs without
# library(tidyr).
out1 <- tidyr::crossing(
  Months = mths,
  Year = year(min(out$Date)):year(max(out$Date))
) %>%
  tidyr::unite(Date, Months, Year, sep = " ") %>%
  mutate(Date = as.yearmon(Date)) %>%
  # explicit key: do not rely on left_join() guessing the common column
  left_join(out, by = "Date") %>%
  mutate(n = tidyr::replace_na(n, 0))
tail(out1)
# A tibble: 6 x 2
# Date n
# <S3: yearmon> <dbl>
#1 Mar 2014 100
#2 Mar 2015 94
#3 Mar 2016 96
#4 Mar 2017 93
#5 Mar 2018 126
#6 Mar 2019 163
Related
I have a daily temperature data for more than 40 years. Here is the sample data:
date tmax
1 1971-01-01 18.9
2 1971-01-02 19.0
3 1971-01-03 19.5
4 1971-01-04 19.2
5 1971-01-05 19.5
.
.
.
17536 2020-12-29 19.7
17537 2020-12-30 18.9
I want to calculate the mean value of temperature for the growing period of crop ie from 7 June to 9 November for each year. How can we do this in r?
Sample Data
library(dplyr)

# Reproducible sample data: one row per day from 1971-01-01 to today, with a
# simulated daily maximum temperature (normal, mean 20, sd 3, one decimal).
set.seed(1)
d <- tibble(date = seq(as.Date("1971-1-1"), Sys.Date(), by = "day")) %>%
  # n() is the row count of the data flowing through the pipe. The previous
  # nrow(d) referred to `d` before it had been created, which fails with
  # "object 'd' not found" in a fresh session.
  mutate(tmax = round(rnorm(n(), 20, 3), 1))
A tidyverse solution
library(lubridate)
library(tidyr)
# Yearly mean of tmax inside and outside the crop season (7 June - 9 November,
# both days included). Assumes `date` is a Date column.
d %>%
  mutate(
    # Compare against calendar dates built with make_date(). ISOdate() returns
    # a date-time at 12:00 GMT by default, so an interval starting at
    # ISOdate(year, 6, 7) begins at noon and leaves 7 June itself out.
    is_crop_season = date >= make_date(year(date), 6, 7) &
      date <= make_date(year(date), 11, 9)
  ) %>%
  group_by(is_crop_season, year = year(date)) %>%
  summarise(mean = mean(tmax)) %>%
  # drop the leftover grouping before reshaping
  ungroup() %>%
  # id_cols / names_from are passed by name: current tidyr places them after
  # `...` in pivot_wider(), so positional use is rejected
  pivot_wider(
    id_cols = year,
    names_from = is_crop_season,
    names_glue = "{ifelse(is_crop_season, 'crop_season', 'no_crop_season')}",
    values_from = mean
  )
Here's an option -
Get date, month and year from the data, filter and keep only the rows from 7 June to 9th November and get average of tmax value for each year.
library(dplyr)
library(lubridate)
# Mean tmax per year over 7 June - 9 November.
# Each date is encoded as month * 100 + day (e.g. 7 June -> 607,
# 9 November -> 1109), so the season is a single numeric range.
df %>%
  mutate(
    date = as.Date(date),
    month_day = month(date) * 100 + day(date)
  ) %>%
  filter(between(month_day, 607, 1109)) %>%
  group_by(year = year(date)) %>%
  summarise(tmax = mean(tmax, na.rm = TRUE))
I'm trying to process the weather data specified below. I thought I was on the right track, but pivot_longer is not being used in the correct manner and is causing partial duplicates.
Can anyone offer any suggestions as to how I can edit my code? I guess one way would be to perform the pivot_longer after splitting the dataframe into several dataframes i.e. first dataframe - jan, year, second dataframe - feb, year.
# Read the ranked Tmax table (first 5 lines skipped, header row kept) and try
# to reshape it into one tmax value per date.
maxT <- read.table('https://www.metoffice.gov.uk/pub/data/weather/uk/climate/datasets/Tmax/ranked/England_S.txt', skip = 5, header = TRUE) %>%
# keep the first 24 columns only
select(c(1:24)) %>%
# stack every second column (2, 4, ..., 24) into a single "year" column
pivot_longer(cols = seq(2,24,2) , values_to = "year") %>%
# coerce the 12 columns that remain in front to numeric
mutate_at(c(1:12), ~as.numeric(as.character(.))) %>%
# stack those 12 columns into month / tmax pairs
# NOTE(review): the two pivots run independently, so every value from the
# first 12 columns is combined with every stacked "year" of its row; the
# question reports partial duplicates from this step.
pivot_longer(cols = c(1:12), names_to = "month", values_to = "tmax") %>%
# map the month name to its number via month.abb and build a first-of-month date
mutate(month = match(str_to_title(month), month.abb),
date = as.Date(paste(year, month, 1, sep = "-"), format = "%Y-%m-%d")) %>%
select(-c("name","year","month")) %>%
arrange(date)
Here is an option with tidyverse, using map2
library(dplyr)
library(purrr)
# Pair each odd-positioned column of maxT with the even-positioned column that
# follows it, giving a list with one two-column data frame per pair.
list_df <- maxT %>%
# odd-positioned columns: 1, 3, 5, ...
select(seq(1, ncol(.), by = 2)) %>%
# iterate over the odd and even column sets in parallel and bind each pair
map2(maxT %>%
select(seq(2, ncol(.), by = 2)), bind_cols) %>%
# rename the bound columns: the first takes the list element's name (.y), the
# second becomes "year"
# NOTE(review): relies on bind_cols() naming the two inputs `...1` and `...2`
imap( ~ .x %>%
rename(!! .y := `...1`, year = `...2`))
-output
map(list_df, head)
#$jan
# A tibble: 6 x 2
# jan year
# <dbl> <int>
#1 9.9 1916
#2 9.8 2007
#3 9.7 1921
#4 9.7 2008
#5 9.5 1990
#6 9.4 1975
#$feb
# A tibble: 6 x 2
# feb year
# <dbl> <int>
#1 11.2 2019
#2 10.7 1998
#3 10.7 1990
#4 10.3 2002
#5 10.3 1945
#6 10 2020
# ...
data
# Input used by the answer above: the ranked Tmax table with the first 5 lines
# skipped, restricted to its first 24 columns.
maxT <- read.table('https://www.metoffice.gov.uk/pub/data/weather/uk/climate/datasets/Tmax/ranked/England_S.txt', skip = 5, header = TRUE) %>%
select(c(1:24))
We can use split.default to split group of 2 columns.
# Columns 1-2 get group 1, columns 3-4 group 2, and so on; split.default()
# then returns one data frame per group of columns.
list_df <- split.default(maxT, (seq_along(maxT) + 1) %/% 2)
data
# Input used by the split.default() answer: the ranked Tmax table with the
# first 5 lines skipped, restricted to its first 24 columns.
maxT <- read.table('https://www.metoffice.gov.uk/pub/data/weather/uk/climate/datasets/Tmax/ranked/England_S.txt', skip = 5, header = TRUE) %>%
select(c(1:24))
I've got a dataset that looks like this
#Determine the M, M-1 and M-2 values for every site
# Example data: three sites with five observation dates each. Site x has the
# same date twice and sites y and z have several dates within one month.
df <- data.frame(
  Site = rep(c("x", "y", "z"), each = 5),
  Date = as.Date(c(
    "2020/09/11", "2020/09/11", "2020/08/02", "2020/07/09", "2020/05/15",
    "2020/08/02", "2020/07/10", "2020/07/19", "2020/06/27", "2020/03/01",
    "2020/04/07", "2020/01/01", "2019/11/10", "2019/11/11", "2019/11/07"
  ))
)
I'm trying to transform the dataset such that
I pick the top 3 most recent months from the Date column for each Site
Spread the Date column such that it is in 'M' 'M-1' and 'M-2' format
'M' being most recent month, 'M-1' being second most recent month, 'M-2' being third most recent month
Convert Date from YYYY/MM/DD format to Month-Year format e.g. 2020/01/01 = JAN 2020
My attempt
For step 1.
# Newest dates first, then keep the first three rows of every site.
transformed <- df %>%
  arrange(desc(Date)) %>%
  group_by(Site) %>%
  slice(1:3)
This works fine up till here. I've been trying various ways of spreading the dates to the desired format, but I haven't succeeded. I'm a bit new to this so any inputs would help a lot.
Desired output
I'd like my desired output to look like this
#Output table
# Target layout: one row per site, with the three most recent months in the
# columns M, M-1 and M-2. The final names are assigned afterwards because
# "M-1" and "M-2" are not syntactic column names.
output <- data.frame(
  Site = c("x", "y", "z"),
  M = c("SEP 2020", "AUG 2020", "APR 2020"),
  M1 = c("AUG 2020", "JUL 2020", "JAN 2020"),
  M2 = c("JUL 2020", "JUN 2020", "NOV 2019")
)
colnames(output) <- c("Site", "M", "M-1", "M-2")
Modification: September 21, 2020
I've slightly modified the original dataframe df to factor in cases where we have more than one date in a given month. In such a situation, how can the solutions you've proposed be modified to get the same output?
Here is one way with dplyr :
library(dplyr)
df %>%
  # newest date first within every site
  arrange(Site, desc(Date)) %>%
  # upper-case "MON YYYY" label for each date
  mutate(month = toupper(format(Date, "%b %Y"))) %>%
  group_by(Site) %>%
  # rank of each distinct month label in order of first appearance, so
  # repeated dates in one month share a rank
  mutate(id = match(month, unique(month))) %>%
  # keep the three most recent distinct months
  filter(id <= 3) %>%
  # rank 1/2/3 becomes the output column name
  mutate(col = c("M", "M-1", "M-2")[id]) %>%
  select(-Date, -id) %>%
  # one row per site; values_fn = unique collapses repeated labels
  tidyr::pivot_wider(
    names_from = col,
    values_from = month,
    values_fn = unique
  )
# Site M `M-1` `M-2`
# <chr> <chr> <chr> <chr>
#1 x SEP 2020 AUG 2020 JUL 2020
#2 y AUG 2020 JUL 2020 JUN 2020
#3 z APR 2020 JAN 2020 NOV 2019
Try:
# Three most recent distinct months per site, spread into columns
# M_1 (most recent), M_2 and M_3.
df %>%
  arrange(Site, desc(Date)) %>%
  mutate(Date = format(Date, "%b %Y")) %>%
  # several dates can fall in the same month; keep each site/month once
  # (distinct() keeps the first occurrence, so the newest-first order holds)
  distinct(Site, Date) %>%
  group_by(Site) %>%
  slice(1:3) %>%
  # number the rows actually present: a hard-coded 1:3 makes mutate() fail for
  # any site that has fewer than three months
  mutate(id_within = paste0("M_", row_number())) %>%
  ungroup() %>%
  pivot_wider(
    id_cols = Site,
    names_from = id_within,
    values_from = Date
  )
You can also try this solution which is close to what you want:
library(tidyverse)
#Code
df %>%
  # newest date first within every site
  arrange(Site, desc(Date)) %>%
  group_by(Site) %>%
  # first three rows of each site
  slice_head(n = 3) %>%
  mutate(
    Month = format(Date, "%b %Y"),
    # labels M-0, M-1, M-2 in row order
    Label = paste0("M-", row_number() - 1)
  ) %>%
  ungroup() %>%
  select(-Date) %>%
  pivot_wider(names_from = Label, values_from = Month)
Output:
# A tibble: 3 x 4
Site `M-0` `M-1` `M-2`
<fct> <chr> <chr> <chr>
1 x Sep 2020 Aug 2020 Jul 2020
2 y Aug 2020 Jul 2020 Jun 2020
3 z Apr 2020 Jan 2020 Nov 2019
Update: You can try this alternative for your new data:
#Code 2
df %>%
  # newest date first within every site
  arrange(Site, desc(Date)) %>%
  group_by(Site) %>%
  mutate(Month = format(Date, "%b %Y")) %>%
  # keep only the first row of every month label within a site
  filter(!duplicated(Month)) %>%
  # three most recent remaining months
  slice_head(n = 3) %>%
  # labels M-0, M-1, M-2 in row order
  mutate(Label = paste0("M-", row_number() - 1)) %>%
  ungroup() %>%
  select(-Date) %>%
  pivot_wider(names_from = Label, values_from = Month)
Output:
# A tibble: 3 x 4
Site `M-0` `M-1` `M-2`
<fct> <chr> <chr> <chr>
1 x Sep 2020 Aug 2020 Jul 2020
2 y Aug 2020 Jul 2020 Jun 2020
3 z Apr 2020 Jan 2020 Nov 2019
Use slice_max:
library(tidyverse)
# Three most recent distinct months per site in columns M, M-1, M-2.
tibble(df) %>%
  # Reduce every date to the first day of its month and de-duplicate, so that
  # several dates in one month occupy a single slot (ranking raw dates lets
  # e.g. two September dates fill both M and M-1).
  mutate(month_start = as.Date(format(Date, "%Y-%m-01"))) %>%
  distinct(Site, month_start) %>%
  group_by(Site) %>%
  slice_max(order_by = month_start, n = 3) %>%
  # make the newest-first order explicit before labelling the rows
  arrange(desc(month_start), .by_group = TRUE) %>%
  mutate(
    Date = toupper(format(month_start, "%b %Y")),
    m = c("M", "M-1", "M-2")[row_number()]
  ) %>%
  ungroup() %>%
  select(-month_start) %>%
  pivot_wider(names_from = m, values_from = Date)
Output:
# A tibble: 3 x 4
Site M `M-1` `M-2`
<fct> <chr> <chr> <chr>
1 x SEP 2020 AUG 2020 JUL 2020
2 y AUG 2020 JUL 2020 JUN 2020
3 z APR 2020 JAN 2020 NOV 2019
I am new to R and I have problems with calculating the amount of bill for each month. I have the dataframe as below:
# Example bills: six breakfast entries, three in January 2020 and three in
# February 2020.
dat <- data.frame(
  time = factor(rep("Breakfast", 6), levels = "Breakfast"),
  date = c(
    "2020-01-20", "2020-01-21", "2020-01-22",
    "2020-02-10", "2020-02-11", "2020-02-12"
  ),
  total_bill = c(12.7557, 14.8, 17.23, 15.7, 16.9, 13.2)
)
My goal is to calculate the amount spending on the Breakfast for each month so here we have two months and I want to get the total sum of January and February separately.
Any help for this would be much appreciated. Thank you!
Does this answer your question?
# Sum total_bill within each full month name derived from the date strings.
sums <- with(dat, tapply(total_bill, format(as.Date(date), "%B"), sum))
February January
45.8000 44.7857
sums is a named one-dimensional array (not a list), so if you want to access, for example, the value for February, you can index it by position (or by name, e.g. sums["February"]):
sums[1]
February
45.8
Alternatively, you can convert sums into a dataframe and access the monthly sums via the month names:
# Same per-month sums as above, converted so that each month can be read with `$`.
sums <- as.data.frame.list(tapply(dat$total_bill, format(as.Date(dat$date), "%B"), sum))
# monthly total for February
sums$February
45.8
Addition:
Another (fun) solution is via regex: you define the dates as a pattern and, using sub plus backreference \\1 to recall the two numbers between the dashes, reduce them to the months part:
# Group by the two-digit month captured from "dddd-dd-dd" date strings and sum total_bill.
# NOTE(review): the key is the month alone, so equal months of different years share a group.
tapply(dat$total_bill, sub("\\d{4}-(\\d{2})-\\d{2}", "\\1", dat$date), sum)
01 02
44.7857 45.8000
We can convert the 'date' to Date class, get the month, and use that as grouping column and sum the 'total_bill'
library(dplyr)
# Total bill per meal time and month name.
dat %>%
  mutate(Month = format(as.Date(date), "%B")) %>%
  group_by(time, Month) %>%
  summarise(total_bill = sum(total_bill, na.rm = TRUE))
# A tibble: 2 x 3
# Groups: time [1]
# time Month total_bill
# <fct> <chr> <dbl>
#1 Breakfast February 45.8
#2 Breakfast January 44.8
We can convert it to 'wide' format, if that is needed
library(tidyr)
# Same totals, reshaped to one column per month name.
out <- dat %>%
  mutate(Month = format(as.Date(date), "%B")) %>%
  group_by(time, Month) %>%
  summarise(total_bill = sum(total_bill, na.rm = TRUE)) %>%
  pivot_wider(names_from = Month, values_from = total_bill)
out
# A tibble: 1 x 3
# Groups: time [1]
# time February January
# <fct> <dbl> <dbl>
# 1 Breakfast 45.8 44.8
If we also need to group by 'year'
# Total bill per meal time, year and month name.
out <- dat %>%
  group_by(
    time,
    Year = format(as.Date(date), "%Y"),
    Month = format(as.Date(date), "%B")
  ) %>%
  summarise(total_bill = sum(total_bill, na.rm = TRUE))
library(dplyr)
# Group on the leading "YYYY-MM" part of the date string and sum the bills.
d_sum <- summarise(
  group_by(dat, substr(date, 0, 7)),
  sum = sum(total_bill)
)
d_sum
# A tibble: 2 x 2
`substr(date, 0, 7)` sum
<chr> <dbl>
1 2020-01 44.8
2 2020-02 45.8
I am using the baby names data in R for practice.
# Sum of n for every year.
total_n <-babynames %>%
# NOTE(review): name_gender is not used further down; summarise() keeps only year and total_n
mutate(name_gender = paste(name,sex))%>%
group_by(year) %>%
summarise(total_n = sum(n, na.rm=TRUE)) %>%
arrange(total_n)
# attach each year's total to every babynames row
bn <- inner_join(babynames,total_n,by = "year")
# share of the yearly total, summed over all rows with the same name and year
df <- bn%>%
mutate(pct_of_names = n/total_n)%>%
group_by(name, year)%>%
summarise(pct =sum(pct_of_names))
The dataframe output looked like this:
For each name, there's all the years, and the related pct for that year. I am stuck with getting the year with the highest pct for each name. How do I do this?
Pretty simple, once you know where the babynames data comes from. You had everything needed:
library(dplyr)
library(babynames)
# Sum of n for every year, ordered by that sum.
total_n <- babynames %>%
  group_by(year) %>%
  summarise(total_n = sum(n, na.rm = TRUE)) %>%
  arrange(total_n)

# Attach each year's total to every babynames row.
bn <- inner_join(babynames, total_n, by = "year")

# Share of the yearly total for each name/year pair, summed over all rows
# with the same name and year.
df <- bn %>%
  group_by(name, year) %>%
  summarise(pct = sum(n / total_n))
You were missing this final step:
# For every name, keep the row(s) whose pct equals that name's maximum pct.
df %>%
group_by(name) %>%
# max(pct) is evaluated within each name group
filter(pct == max(pct))
# A tibble: 95,025 x 3
# Groups: name [95,025]
name year pct
<chr> <dbl> <dbl>
1 Aaban 2014 4.338256e-06
2 Aabha 2014 2.440269e-06
3 Aabid 2003 1.316094e-06
4 Aabriella 2015 1.363073e-06
5 Aada 2015 1.363073e-06
6 Aadam 2015 5.997520e-06
7 Aadan 2009 6.031433e-06
8 Aadarsh 2014 4.880538e-06
9 Aaden 2009 3.335645e-04
10 Aadesh 2011 1.370356e-06
# ... with 95,015 more row
group_by and filter are your friends.