How to split dataframe into multiple dataframes by column index - r

I'm trying to process the weather data specified below. I thought I was on the right track, but pivot_longer is not being used in the correct manner and is causing partial duplicates.
Can anyone offer any suggestions as to how I can edit my code? I guess one way would be to perform the pivot_longer after splitting the dataframe into several dataframes i.e. first dataframe - jan, year, second dataframe - feb, year.
maxT <- read.table('https://www.metoffice.gov.uk/pub/data/weather/uk/climate/datasets/Tmax/ranked/England_S.txt', skip = 5, header = TRUE) %>%
select(c(1:24)) %>%
pivot_longer(cols = seq(2,24,2) , values_to = "year") %>%
mutate_at(c(1:12), ~as.numeric(as.character(.))) %>%
pivot_longer(cols = c(1:12), names_to = "month", values_to = "tmax") %>%
mutate(month = match(str_to_title(month), month.abb),
date = as.Date(paste(year, month, 1, sep = "-"), format = "%Y-%m-%d")) %>%
select(-c("name","year","month")) %>%
arrange(date)

Here is an option with tidyverse, using map2
library(dplyr)
library(purrr)
list_df <- maxT %>%
select(seq(1, ncol(.), by = 2)) %>%
map2(maxT %>%
select(seq(2, ncol(.), by = 2)), bind_cols) %>%
imap( ~ .x %>%
rename(!! .y := `...1`, year = `...2`))
-output
map(list_df, head)
#$jan
# A tibble: 6 x 2
# jan year
# <dbl> <int>
#1 9.9 1916
#2 9.8 2007
#3 9.7 1921
#4 9.7 2008
#5 9.5 1990
#6 9.4 1975
#$feb
# A tibble: 6 x 2
# feb year
# <dbl> <int>
#1 11.2 2019
#2 10.7 1998
#3 10.7 1990
#4 10.3 2002
#5 10.3 1945
#6 10 2020
# ...
data
maxT <- read.table('https://www.metoffice.gov.uk/pub/data/weather/uk/climate/datasets/Tmax/ranked/England_S.txt', skip = 5, header = TRUE) %>%
select(c(1:24))

We can use split.default to split group of 2 columns.
list_df <- split.default(maxT, ceiling(seq_along(maxT)/2))
data
maxT <- read.table('https://www.metoffice.gov.uk/pub/data/weather/uk/climate/datasets/Tmax/ranked/England_S.txt', skip = 5, header = TRUE) %>%
select(c(1:24))

Related

Create new variable in from subset of existing columns and if value present create duplicate rows in R

I have a list of columns where three of them relate to year, which identifies the occurrence of that data being collected for that year (1 or -1 means to be collected). I would like to create / manipulate the dataframe to have 'Year' as a new variable, which would replicate the other columns if it is to be collected for multiple years. What is the most efficient way to do this using dplyr / tidyverse?
I am aware that pivot_longer might work, but I assume I may also need to use case_when or similar. Any help appreciated.
# Example input: one row per Programme/Function pair. Each YearXXXX column
# holds 1 or -1 when data is to be collected for that year and NA otherwise.
Programme <- c(1, 2, 2, 3, 4)
Function <- c("Inv", "Inv", "Mac", "Inv", "Inv")
Year2020 <- c(1, 1, 1, -1, 1)
Year2021 <- c(1, 1, NA, NA, NA)
Year2022 <- c(NA, NA, NA, -1, -1)
df <- data.frame(Programme, Function, Year2020, Year2021, Year2022)
So what I am trying to produce is this:
Year <- c(2020, 2021, 2020, 2021, 2020, 2020, 2022, 2020, 2022)
Programme <- c(1, 1, 2, 2, 2, 3, 3, 4, 4)
Function <- c("Inv", "Inv", "Inv", "Inv", "Mac", "Inv", "Inv", "Inv", "Inv")
df <- data.frame(Year, Programme, Function)
Update (see the comment by Darren Tsai): "Adding names_transform = list(Year = parse_number) into pivot_longer() can save you the mutate line. Besides, in tidyr there is a more flexible function than na.omit() to treat NA, i.e. drop_na()"
library(tidyr)
library(dplyr)
# Reshape the Year* indicator columns to long form and keep only the
# Programme/Function/year combinations that carry a collection flag.
df %>%
  pivot_longer(
    cols = starts_with("Year"),
    names_to = "Year",
    # parse_number() comes from readr, which is not attached here, so it is
    # namespace-qualified; it turns a name such as "Year2020" into 2020.
    names_transform = list(Year = readr::parse_number)
  ) %>%
  # Rows whose year flag is NA are dropped.
  drop_na() %>%
  dplyr::select(Year, Programme, Function)
Alternative pivot method using names_pattern and regex "(\\D+)(\\d+)":
library(tidyr)
library(dplyr)
df %>%
pivot_longer(
cols = starts_with("Year"),
names_to = c(".value", "Year1"),
names_pattern = "(\\D+)(\\d+)"
) %>%
na.omit() %>%
dplyr::select(Year = Year1, Programme, Function)
OR using parse_number from readr package:
library(tidyverse)
df %>%
pivot_longer(
cols = starts_with("Year"),
names_to = "Year"
) %>%
mutate(Year = parse_number(Year)) %>%
na.omit() %>%
dplyr::select(Year, Programme, Function)
Year Programme Function
<chr> <dbl> <chr>
1 2020 1 Inv
2 2021 1 Inv
3 2020 2 Inv
4 2021 2 Inv
5 2020 2 Mac
6 2020 3 Inv
7 2022 3 Inv
8 2020 4 Inv
9 2022 4 Inv
Using dplyr, tidyr and stringr packages:
library(tidyverse)
df |>
pivot_longer(3:5,
names_to = "Year") |>
filter(value == 1 | value == -1) |>
mutate(Year = str_remove(Year,"^Year")) |>
select(Year, Programme, Function)
Output:
# A tibble: 9 x 3
Year Programme Function
<chr> <dbl> <chr>
1 2020 1 Inv
2 2021 1 Inv
3 2020 2 Inv
4 2021 2 Inv
5 2020 2 Mac
6 2020 3 Inv
7 2022 3 Inv
8 2020 4 Inv
9 2022 4 Inv

Perform the operation on the columns of dataframe by ignoring specific characters in r

I have a problem with performing the operation in dataframe when there are some missing values in those columns of dataframe. My goal is just ignoring them (not to delete the row or column with the missing value) and perform the operation as normal.
Here is my dataframe:
dat <- data.frame(
time = factor(c("Breakfast","Breakfast","Lunch","Lunch","Breakfast","Dinner","Dinner","Dinner","Snack","Snack","Lunch","Snack","Snack"), levels=c("Breakfast","Lunch","Dinner","Snack")),
total_bill_x = c("*",14.89,20.5,17.23,30.3,"*",20.7,32.3,25.4,14.5,13.7,14.2,15.7), total_bill_y= c(20.75,"*",18.52,"*",27.3,23.6,19.75,27.3,21.48,13.66,15.59,17.3,14.78)
)
I want to perform the operation like sum(dat$total_bill_x) sum(dat$total_bill_y).
The columns are factor, which needs to be converted to numeric. There are elements such as * which would become NA in the process and gives a friendly warning
library(dplyr)
dat %>%
summarise_at(vars(starts_with('total')), ~
sum(as.numeric(as.character(.)), na.rm = TRUE))
If we don't want the warnings, one option is to selectively remove those * elements before doing the sum
dat %>%
summarise_at(vars(starts_with('total')),
~ sum(as.numeric(as.character(.[.!= "*"]))))
# total_bill_x total_bill_y
#1 219.42 220.03
or in base R
# Sum every column except the first, skipping the "*" placeholders.
# vapply() pins the result to one numeric value per column, whereas sapply()
# can silently change its return type on empty or ragged input.
vapply(dat[-1], function(x) sum(as.numeric(as.character(x[x != "*"]))),
       numeric(1))
# total_bill_x total_bill_y
# 219.42 220.03
Or change the * to NA, convert to numeric, extract the column and sum
dat[-1] <- lapply(dat[-1], function(x)
as.numeric(replace(as.character(x), x == "*", NA)))
sum(dat$total_bill_x, na.rm = TRUE)
If we want to do a group by operation, one option is to convert the * to NA with na_if(from dplyr), convert to numeric (as.numeric), grouped by 'time', get the sum of the column in summarise and create the 'pourcentage' column by dividing the 'total' with the sum of 'total'
dat %>%
mutate_at(vars(starts_with('total')), ~ as.numeric(na_if(., "*"))) %>%
group_by(time) %>%
summarise(total = sum(total_bill_x, na.rm = TRUE)) %>%
mutate(pourcentage=total/sum(total)*100)
# A tibble: 4 x 3
# time total pourcentage
# <fct> <dbl> <dbl>
#1 Breakfast 16 20.8
#2 Lunch 17 22.1
#3 Dinner 21 27.3
#4 Snack 23 29.9
We can also do
dat %>%
  # Parse the bill columns as numbers before summing them.
  mutate(across(starts_with('total'), readr::parse_number)) %>%
  # across() takes its naming template through `.names`, and the glue
  # placeholder for the input column name is `{.col}`.
  summarise(across(starts_with('total'),
                   ~sum(., na.rm = TRUE), .names = "total_{.col}"))
We can use readr::parse_number
library(dplyr)
dat %>%
mutate_at(vars(starts_with('total')), readr::parse_number) %>%
summarise_at(vars(starts_with('total')), sum, na.rm = TRUE)
# total_bill_x total_bill_y
#1 219.42 220.03
To sum by group i.e time.
dat %>%
mutate_at(vars(starts_with('total')), readr::parse_number) %>%
group_by(time) %>%
summarise_at(vars(starts_with('total')), sum, na.rm = TRUE)
# time total_bill_x total_bill_y
# <fct> <dbl> <dbl>
#1 Breakfast 45.2 48.0
#2 Lunch 51.4 34.1
#3 Dinner 53 70.6
#4 Snack 69.8 67.2

Convert List to a Dataframe

I am trying to convert the following list to a dataframe.
I have tried melt/cast, ldply, unlist etc but can't seem to get the expected output.
Many thanks in advance!
df <- list(
name=rep(c(11,12), each=1),
value=rnorm(2),
name=rep(c(13,14), each=1),
value=rnorm(2)
)
df
I want the following output in a dataframe:
name value
11 1.187
12 0.691
13 0.452
14 0.898
An option is to stack into a two column data.frame, and spread it back to 'wide' format
library(tidyverse)
enframe(df) %>%
unnest(value) %>%
group_by(name) %>%
mutate(rn = row_number()) %>%
spread(name, value) %>%
select(-rn)
# A tibble: 4 x 2
# name value
# <dbl> <dbl>
#1 11 -0.484
#2 12 -0.110
#3 13 -0.328
#4 14 0.0737
Or another option is to make use of pivot_longer from the devel version of tidyr
df %>%
set_names(str_c(names(.), "_", cumsum(names(.) == "name"))) %>%
as_tibble %>%
mutate(rn = row_number()) %>%
pivot_longer(-rn, names_to =c(".value", "group"), names_sep = '_') %>%
select(name, value)
Or using base R
reshape(transform(stack(df), rn = ave(seq_along(ind), ind,
FUN = seq_along)), idvar = 'rn', direction = 'wide', timevar = 'ind')
Here's a way in base R using split -
data.frame(
split(v <- unlist(df), sub("[0-9]+", "", names(v)))
)
name value
1 11 -0.2282623
2 12 -0.8101849
3 13 -0.9311898
4 14 0.3638835
Data -
df <- structure(list(name = c(11, 12), value = c(-0.22826229127103,
-0.810184913338659), name = c(13, 14), value = c(-0.931189778412408,
0.363883463286674)), .Names = c("name", "value", "name", "value"
))
d <- data.frame(
name = unlist(df[names(df) == "name"]),
value = unlist(df[names(df) == "value"])
)
the_list <- list(
name=rep(c(11,12), each=1),
value=rnorm(2),
name=rep(c(13,14), each=1),
value=rnorm(2)
)
df <- data.frame(name = unlist(the_list[which(names(the_list) == "name")]),
value = unlist(the_list[which(names(the_list) == "value")]))
df
# name value
# 1 11 -0.83130395
# 2 12 -0.12782566
# 3 13 2.59769395
# 4 14 -0.06967617

Summing the number of occurrences from m/d/y to y/m

I have data from each of the avalanches that occurred. I need to calculate the number of avalanches that occurred in each year and month, but the data just gives the exact days on which an avalanche occurred. How do I sum the number of occurrences during each year-month? I also only need the winter-related year-months (Dec (12) - March (3)). Please help!
library(XML)
library(RCurl)
library(dplyr)
avalanche.url <- "https://utahavalanchecenter.org/observations?page="
all.pages <- 0:202
# Download each results page and read its first HTML table into a data frame.
page.tables <- lapply(all.pages, function(page) {
  this.url <- paste0(avalanche.url, page)
  this.webpage <- htmlParse(getURL(this.url))
  thispage.avalanche <- readHTMLTable(this.webpage, which = 1, header = TRUE,
                                      stringsAsFactors = FALSE)
  names(thispage.avalanche) <- c('Date', 'Region', 'Location', 'Observer')
  thispage.avalanche
})
# Bind all pages once at the end instead of growing the data frame with
# rbind() on every iteration, which re-copies the accumulated rows each time.
avalanche <- do.call(rbind, page.tables)
# subset the data to the Salt Lake Region
avalancheslc<-subset(avalanche, Region=="Salt Lake")
str(avalancheslc)
The output should look something like:
Date AvalancheTotal
2000-01 1
2000-02 2
2000-03 8
2000-12 23
2001-01 16
.
.
.
.
.
2019-03 45
Using dplyr, you could get the variable of interest ("year-month") from the Date column, group by this variable, and then compute the number of rows in each group.
In a similar way, you can filter to only get the months you like:
library(dplyr)
winter_months <- c(1:3, 12)
avalancheslc %>%
mutate(Date = as.Date(Date, "%m/%d/%Y")) %>%
mutate(YearMonth = format(Date,"%Y-%m"),
Month = as.numeric(format(Date,"%m"))) %>%
filter(Month %in% winter_months) %>%
group_by(YearMonth) %>%
summarise(AvalancheTotal = n())
We can convert to yearmon from zoo and use that in the group_by to get the number of rows
library(dplyr)
library(zoo)
dim(avalancheslc)
#[1] 5494 4
out <- avalancheslc %>%
group_by(Date = format(as.yearmon(Date, "%m/%d/%Y"), "%Y-%m")) %>%
summarise(AvalancheTotal = n())
If we need only output from December to March, then filter the data
subOut <- out %>%
filter(as.integer(substr(Date, 6, 7)) %in% c(12, 1:3))
Or it can be filtered earlier in the chain
library(lubridate)
out <- avalancheslc %>%
mutate(Date = as.yearmon(Date, "%m/%d/%Y")) %>%
filter(month(Date) %in% c(12, 1:3)) %>%
count(Date)
dim(out)
#[1] 67 2
Now, for filling with 0's
library(tidyr)  # provides crossing(), unite() and replace_na() used below
mths <- month.abb[c(12, 1:3)]
# Build every winter month/year combination across the observed range, then
# join the counts onto it so months with no rows in `out` get a count of 0.
out1 <- crossing(Months = mths,
                 Year = year(min(out$Date)):year(max(out$Date))) %>%
  unite(Date, Months, Year, sep = " ") %>%
  mutate(Date = as.yearmon(Date)) %>%
  # Name the join key explicitly rather than relying on the implicit match.
  left_join(out, by = "Date") %>%
  mutate(n = replace_na(n, 0))
tail(out1)
# A tibble: 6 x 2
# Date n
# <S3: yearmon> <dbl>
#1 Mar 2014 100
#2 Mar 2015 94
#3 Mar 2016 96
#4 Mar 2017 93
#5 Mar 2018 126
#6 Mar 2019 163

max([column])where name = (each unique name in the name column) for each year in R

I am using the baby names data in R for practice.
total_n <-babynames %>%
mutate(name_gender = paste(name,sex))%>%
group_by(year) %>%
summarise(total_n = sum(n, na.rm=TRUE)) %>%
arrange(total_n)
bn <- inner_join(babynames,total_n,by = "year")
df <- bn%>%
mutate(pct_of_names = n/total_n)%>%
group_by(name, year)%>%
summarise(pct =sum(pct_of_names))
The dataframe output looked like this:
For each name, there's all the years, and the related pct for that year. I am stuck with getting the year with the highest pct for each name. How do I do this?
Pretty simple, once you know where the babynames data comes from. You had everything needed:
library(dplyr)
library(babynames)
total_n <-babynames %>%
mutate(name_gender = paste(name,sex))%>%
group_by(year) %>%
summarise(total_n = sum(n, na.rm=TRUE)) %>%
arrange(total_n)
bn <- inner_join(babynames,total_n,by = "year")
df <- bn%>%
mutate(pct_of_names = n/total_n)%>%
group_by(name, year)%>%
summarise(pct =sum(pct_of_names))
You were missing this final step:
df %>%
group_by(name) %>%
filter(pct == max(pct))
# A tibble: 95,025 x 3
# Groups: name [95,025]
name year pct
<chr> <dbl> <dbl>
1 Aaban 2014 4.338256e-06
2 Aabha 2014 2.440269e-06
3 Aabid 2003 1.316094e-06
4 Aabriella 2015 1.363073e-06
5 Aada 2015 1.363073e-06
6 Aadam 2015 5.997520e-06
7 Aadan 2009 6.031433e-06
8 Aadarsh 2014 4.880538e-06
9 Aaden 2009 3.335645e-04
10 Aadesh 2011 1.370356e-06
# ... with 95,015 more rows
group_by and filter are your friends.

Resources