Merging two data frames with different numbers of observations and matching them - r

The data frame below is the result of merging two data frames with cbindX(Period1, Period2). Both have the same columns, but they cover two different time periods and have different numbers of observations for AEZ.
Example for Abyei and Angola:
> dput(new_data2[1:6, c(1,2,3,5,7,8,9,11) ])
structure(list(AEZ_1 = c("Tropics, lowland semi-arid", "Dominantly hydromorphic soils", "Tropics, lowland sub-humid", "Tropics, lowland semi-arid", "Dominantly built-up land", "Dominantly hydromorphic soils"), Country_1 = c("Abyei", "Abyei", "Angola", "Angola", "Angola", "Angola"), File_name_1 = c("PRIO_AEZ_FS_1981_2010", "PRIO_AEZ_FS_1981_2010", "PRIO_AEZ_FS_1981_2010", "PRIO_AEZ_FS_1981_2010", "PRIO_AEZ_FS_1981_2010", "PRIO_AEZ_FS_1981_2010"), Share_1 = c(9418.132755827, 520.625044495, 616817.473747498, 278142.684969026, 1330.4290338252, 74581.3053271609), AEZ_2 = c("Tropics, lowland semi-arid", "Tropics, lowland sub-humid", "Dominantly hydromorphic soils", "Tropics, lowland sub-humid", "Tropics, lowland semi-arid", "Dominantly built-up land"), Country_2 = c("Abyei", "Abyei", "Abyei", "Angola", "Angola", "Angola"), File_name_2 = c("PRIO_AEZ_FS_2011_2040", "PRIO_AEZ_FS_2011_2040", "PRIO_AEZ_FS_2011_2040", "PRIO_AEZ_FS_2011_2040", "PRIO_AEZ_FS_2011_2040", "PRIO_AEZ_FS_2011_2040"), Share_2 = c(8475.525647713, 942.6071081139, 520.625044495, 754641.194306016, 289900.409286599, 1330.4290338252)), row.names = c(NA, 6L), class = "data.frame")
I would like the rows matched on Country so that I can see how AEZ changes over time.
Thanks

Assume you have two data frames (an old and a new one) with country properties:
library(tidyverse)
old <- tribble(
~AEZ, ~Country,
1, "Abyei",
2, "Angola"
) %>%
mutate(time = "old")
old
#> # A tibble: 2 x 3
#> AEZ Country time
#> <dbl> <chr> <chr>
#> 1 1 Abyei old
#> 2 2 Angola old
new <- tribble(
~AEZ, ~Country,
1, "Abyei",
2, "Angola",
3, "Angola"
) %>%
mutate(time = "new")
new
#> # A tibble: 3 x 3
#> AEZ Country time
#> <dbl> <chr> <chr>
#> 1 1 Abyei new
#> 2 2 Angola new
#> 3 3 Angola new
old %>%
full_join(new) %>%
pivot_wider(names_from = time, values_from = AEZ) %>%
unnest(old) %>%
unnest(new)
#> Joining, by = c("AEZ", "Country", "time")
#> Warning: Values are not uniquely identified; output will contain list-cols.
#> * Use `values_fn = list` to suppress this warning.
#> * Use `values_fn = length` to identify where the duplicates arise
#> * Use `values_fn = {summary_fun}` to summarise duplicates
#> # A tibble: 3 x 3
#> Country old new
#> <chr> <dbl> <dbl>
#> 1 Abyei 1 1
#> 2 Angola 2 2
#> 3 Angola 2 3
Created on 2021-09-21 by the reprex package (v2.0.1)

My suggestion is: rename the AEZ variable in the first file (data frame) to AEZ_1981 and the same variable in the second file to AEZ_2011 before merging. This way you keep all the information and can compare the changes in the merged file, for example:
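A minimal sketch of that idea with dplyr, assuming Period1 and Period2 each still have a Country and an AEZ column:
library(dplyr)
# Rename the AEZ column in each period before merging (column names assumed).
p1 <- rename(Period1, AEZ_1981 = AEZ)
p2 <- rename(Period2, AEZ_2011 = AEZ)
# full_join() keeps every country from both periods; rows are matched on
# Country, so AEZ_1981 and AEZ_2011 end up side by side for comparison.
merged <- full_join(p1, p2, by = "Country")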
Best,
Lev

If it helps, I figured out how to do it:
new_data <- merge(Period1, Period2, by = c("Country", "AEZ"), all = TRUE)

Related

Aggregating a dataframe with dplyr in R based on several dummy variables

I am using dplyr to aggregate my dataframe, so it shows percentages of people choosing specific protein design tasks by company size. I have different dummy variables for protein design tasks, because this was a multiple choice question in a survey.
I figured out a way to do this, but my code is very long, because I aggregate the data per task and then join all these separate dataframes together into one. I’m curious whether there is a more elegant (shorter) way to do this?
library(tidyverse)
EarlyAccess <- read_csv("https://dropbox.com/s/antzwk1jh4ldrhi/EarlyAccess_anon.csv?dl=1")
#################### STABILITY ################################################
Proportions_tasks_stability <- EarlyAccess %>%
select(size, Improving.stability..generic..thermal..pH.) %>%
group_by(size, Improving.stability..generic..thermal..pH.) %>%
summarise(count_var_stability=n())%>%
mutate(total_group_by_size = sum(count_var_stability)) %>%
mutate(pc_var_stability=count_var_stability/sum(count_var_stability)*100) %>%
filter(Improving.stability..generic..thermal..pH.=="Improving stability (generic, thermal, pH)") %>%
select(size, Improving.stability..generic..thermal..pH., pc_var_stability)
######################## ACTIVITY #############################################
Proportions_tasks_activity <- EarlyAccess %>%
select(size, Improving.activity ) %>%
group_by(size, Improving.activity) %>%
summarise(count_var_activity=n())%>%
mutate(total_group_by_size = sum(count_var_activity)) %>%
mutate(pc_var_activity=count_var_activity/sum(count_var_activity)*100) %>%
filter(Improving.activity=="Improving activity") %>%
select(size, Improving.activity, pc_var_activity)
######################## BINDING AFFINITY ######################################
Proportions_tasks_binding.affinity<- EarlyAccess %>%
select(size, Improving.binding.affinity ) %>%
group_by(size, Improving.binding.affinity) %>%
summarise(count_var_binding.affinity=n())%>%
mutate(total_group_by_size = sum(count_var_binding.affinity)) %>%
mutate(pc_var_binding.affinity=count_var_binding.affinity/sum(count_var_binding.affinity)*100) %>%
filter(Improving.binding.affinity=="Improving binding affinity") %>%
select(size, Improving.binding.affinity, pc_var_binding.affinity)
# Then join them
Protein_design_tasks <- Proportions_tasks_stability %>%
inner_join(Proportions_tasks_activity, by = "size") %>%
inner_join(Proportions_tasks_binding.affinity, by = "size")
Using the datafile you provided, this should give the percentages of the selected category within each column for each size:
library(tidyverse)
df <-
read_csv("https://dropbox.com/s/antzwk1jh4ldrhi/EarlyAccess_anon.csv?dl=1")
df |>
group_by(size) |>
summarise(
pc_var_stability = sum(
Improving.stability..generic..thermal..pH. == "Improving stability (generic, thermal, pH)",
na.rm = TRUE
) / n() * 100,
pc_var_activity = sum(Improving.activity == "Improving activity",
na.rm = TRUE) / n() * 100,
pc_var_binding.affinity = sum(
Improving.binding.affinity == "Improving binding affinity",
na.rm = TRUE
) / n() * 100
)
#> # A tibble: 7 × 4
#> size pc_var_stability pc_var_activity pc_var_binding.affinity
#> <chr> <dbl> <dbl> <dbl>
#> 1 1000-10000 43.5 47.8 34.8
#> 2 10000+ 65 65 70
#> 3 11-50 53.8 53.8 46.2
#> 4 2-10 51.1 46.8 46.8
#> 5 200-1000 64.7 52.9 52.9
#> 6 50-200 42.1 42.1 36.8
#> 7 Just me 48.5 39.4 54.5
Looking at your data, each column has either the string value you're testing for or NA, so you could make it even shorter/tidier just by counting non-NAs in relevant columns:
df |>
group_by(size) |>
summarise(across(
c(
Improving.stability..generic..thermal..pH.,
Improving.activity,
Improving.binding.affinity
),
\(val) 100 * sum(!is.na(val)) / n()
))
If you're aiming to summarise across all columns, the latter method may work best; there are several ways of specifying which columns you want, so you don't necessarily need to type out every name and value. You might also find it clearest to wrap the percentage calculation and formatting in a named function to call:
library(tidyverse)
df <-
read_csv("https://dropbox.com/s/antzwk1jh4ldrhi/EarlyAccess_anon.csv?dl=1",
show_col_types = FALSE)
perc_nonmissing <- function(val) {
sprintf("%.1f%%", 100 * sum(!is.na(val)) / n())
}
df |>
group_by(size) |>
summarise(across(-c(1:2), perc_nonmissing))
#> # A tibble: 7 × 12
#> size Disco…¹ Searc…² Under…³ Impro…⁴ Impro…⁵ Impro…⁶ Impro…⁷ Impro…⁸ Impro…⁹
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1000-… 21.7% 17.4% 43.5% 47.8% 39.1% 43.5% 30.4% 39.1% 39.1%
#> 2 10000+ 40.0% 55.0% 55.0% 65.0% 70.0% 65.0% 20.0% 30.0% 40.0%
#> 3 11-50 30.8% 26.9% 42.3% 53.8% 38.5% 53.8% 15.4% 30.8% 38.5%
#> 4 2-10 38.3% 40.4% 48.9% 46.8% 36.2% 51.1% 23.4% 31.9% 42.6%
# etc.
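If the columns you want share a naming pattern, a tidyselect helper saves typing the names out. A small sketch, assuming the task columns in this file all start with "Improving":
df |>
  group_by(size) |>
  # starts_with() picks the task columns by prefix instead of by position.
  summarise(across(starts_with("Improving"), perc_nonmissing))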

How to read files in two separate lists in a function based on a condition in R

Okay, I hope I manage to sum up what I need to achieve. I am running experiments in which I obtain data from two different sources, with date_time being the matching variable. The data from the two sources have the same structure (in csv or txt); the distinction is in the filenames. I provide an example below:
list_of_files <- structure(
  list(
    solid_epoxy1_10 = data.frame(
      date_time = c("20/07/2022 13:46", "20/07/2022 13:56", "20/07/2022 14:06"),
      frequency = c("30000", "31000", "32000"),
      index = c("1", "2", "3")
    ),
    solid_otherpaint_20 = data.frame(
      date_time = c("20/07/2022 13:10", "20/07/2022 13:20", "20/07/2022 14:30"),
      frequency = c("20000", "21000", "22000"),
      index = c("1", "2", "3")
    ),
    water_epoxy1_10 = data.frame(
      date_time = c("20/07/2022 13:46", "20/07/2022 13:56", "20/07/2022 14:06"),
      temperature = c("22.3", "22.6", "22.5")
    ),
    water_otherpaint_20 = data.frame(
      date_time = c("20/07/2022 13:10", "20/07/2022 13:20", "20/07/2022 14:30"),
      temperature = c("24.5", "24.6", "24.8")
    )
  )
)
First I want to read the data files into two separate lists: one for files whose names contain the keyword "solid", and one for files whose names contain "water".
Then I need to create new columns in each data frame from the filename, split on "_" (e.g. paint = "epoxy1", thickness = "10"), so that I can do an inner join on date_time, paint, thickness, etc. What I'm struggling with so far is writing a function that loads the files into two separate lists. This is what I've tried so far:
load_files <-
  function(list_of_files) {
    all.files.board <- list()
    all.files.temp <- list()
    for (i in 1:length(list_of_files))
    {
      if (exists("board")) {
        all.files.board[[i]] = fread(list_of_files[i])
      }
      else {
        all.files.temp[[i]] = fread(list_of_files[i])
      }
      return(list(all.files.board, all.files.temp))
    }
  }
But it doesn't do what I need. I hope I made it as clear as possible. I'm pretty comfortable with the tidyverse, but I'm still a newbie at writing custom functions. Any ideas welcome.
Regarding the question in the title:
The first issue, calling return() too early and thus breaking out of the for-loop, was already mentioned in the comments and should be sorted.
The next issue is the condition itself: if (exists("board")) checks whether an object called board exists. In the provided sample it evaluates to TRUE only if something was assigned to a global board object before load_files() was called, and to FALSE only if there was no such assignment or board was explicitly removed. That is, after board <- "something"; dataframes <- load_files(file_list) the check is TRUE, while after rm(board); dataframes <- load_files(file_list) it is FALSE. Nothing in the function itself changes the existence of board, so the result is effectively determined before the function is called.
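Addressing the title question literally: splitting the file paths by keyword before reading removes the need for the condition entirely. A minimal base R sketch, assuming csv_files is a character vector of the full file paths (the directory here is a placeholder):
library(data.table)
# Hypothetical directory; csv_files becomes a character vector of file paths.
csv_files <- list.files("path/to/experiment/files", pattern = "\\.csv$", full.names = TRUE)
solid_files <- csv_files[grepl("solid", basename(csv_files))]
water_files <- csv_files[grepl("water", basename(csv_files))]
# Read each group into its own named list of data.tables.
all.files.board <- setNames(lapply(solid_files, fread), basename(solid_files))
all.files.temp  <- setNames(lapply(water_files, fread), basename(water_files))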
If the core of the question is about joining two somewhat different datasets and splitting the result by groups, I'd drop the loops, the conditions, and most of the lists involved, and go with something like this using the tidyverse:
library(fs)
library(readr)
library(stringr)
library(dplyr)
library(tidyr)
# prepare input files for sample ------------------------------------------
sample_dfs <- structure(
list
(
solid_epoxy1_10 = data.frame(
date_time = c("20/07/2022 13:46", "20/07/2022 13:56", "20/07/2022 14:06"),
frequency = c("30000", "31000", "32000"),
index = c("1", "2", "3")
),
solid_otherpaint_20 = data.frame(
date_time = c("20/07/2022 13:10", "20/07/2022 13:20", "20/07/2022 14:30"),
frequency = c("20000", "21000", "22000"),
index = c("1", "2", "3")
),
water_epoxy1_10 = data.frame(
date_time = c("20/07/2022 13:46", "20/07/2022 13:56", "20/07/2022 14:06"),
temperature = c("22.3", "22.6", "22.5")
),
water_otherpaint_20 = data.frame(
date_time = c("20/07/2022 13:10", "20/07/2022 13:20", "20/07/2022 14:30"),
temperature = c("24.5", "24.6", "24.8")
)
)
)
tmp_path <- file_temp("reprex")
dir_create(tmp_path)
sample_filenames <- str_glue("{1:length(sample_dfs)}_{names(sample_dfs)}.csv")
for (i in seq_along(sample_dfs)) {
write_csv(sample_dfs[[i]], path(tmp_path, sample_filenames[i]))
}
dir_ls(tmp_path, type = "file")
#> Temp/RtmpqUoct8/reprex5cc517f177b/1_solid_epoxy1_10.csv
#> Temp/RtmpqUoct8/reprex5cc517f177b/2_solid_otherpaint_20.csv
#> Temp/RtmpqUoct8/reprex5cc517f177b/3_water_epoxy1_10.csv
#> Temp/RtmpqUoct8/reprex5cc517f177b/4_water_otherpaint_20.csv
# read files --------------------------------------------------------------
t_solid <- dir_ls(tmp_path, glob = "*solid*.csv", type = "file") %>%
read_csv(id = "filename") %>%
extract(filename, c("paint", "thickness"), "_([^_]+)_(\\d+)\\.csv")
t_solid
#> # A tibble: 6 × 5
#> paint thickness date_time frequency index
#> <chr> <chr> <chr> <dbl> <dbl>
#> 1 epoxy1 10 20/07/2022 13:46 30000 1
#> 2 epoxy1 10 20/07/2022 13:56 31000 2
#> 3 epoxy1 10 20/07/2022 14:06 32000 3
#> 4 otherpaint 20 20/07/2022 13:10 20000 1
#> 5 otherpaint 20 20/07/2022 13:20 21000 2
#> 6 otherpaint 20 20/07/2022 14:30 22000 3
t_water <- dir_ls(tmp_path, glob = "*water*.csv", type = "file") %>%
read_csv(id = "filename") %>%
extract(filename, c("paint", "thickness"), "_([^_]+)_(\\d+)\\.csv")
t_water
#> # A tibble: 6 × 4
#> paint thickness date_time temperature
#> <chr> <chr> <chr> <dbl>
#> 1 epoxy1 10 20/07/2022 13:46 22.3
#> 2 epoxy1 10 20/07/2022 13:56 22.6
#> 3 epoxy1 10 20/07/2022 14:06 22.5
#> 4 otherpaint 20 20/07/2022 13:10 24.5
#> 5 otherpaint 20 20/07/2022 13:20 24.6
#> 6 otherpaint 20 20/07/2022 14:30 24.8
# or implement as a function ----------------------------------------------
load_files <- function(csv_path, glob = "*.csv") {
  return(
    dir_ls(csv_path, glob = glob, type = "file") %>%
      # store filenames in filename column
      read_csv(id = "filename", show_col_types = FALSE) %>%
      # extract each regex group to its own column
      extract(filename, c("paint", "thickness"), "_([^_]+)_(\\d+)\\.csv")
  )
}
# join / group / split ----------------------------------------------------
t_solid <- load_files(tmp_path, "*solid*.csv")
t_water <- load_files(tmp_path, "*water*.csv")
# either join by multiple columns or select only required cols
# to avoid x.* & y.* columns in result
inner_join(t_solid, t_water, by = c("date_time", "paint", "thickness")) %>%
group_by(paint) %>%
group_split()
Final result as a list of tibbles:
#> <list_of<
#> tbl_df<
#> paint : character
#> thickness : character
#> date_time : character
#> frequency : double
#> index : double
#> temperature: double
#> >
#> >[2]>
#> [[1]]
#> # A tibble: 3 × 6
#> paint thickness date_time frequency index temperature
#> <chr> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 epoxy1 10 20/07/2022 13:46 30000 1 22.3
#> 2 epoxy1 10 20/07/2022 13:56 31000 2 22.6
#> 3 epoxy1 10 20/07/2022 14:06 32000 3 22.5
#>
#> [[2]]
#> # A tibble: 3 × 6
#> paint thickness date_time frequency index temperature
#> <chr> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 otherpaint 20 20/07/2022 13:10 20000 1 24.5
#> 2 otherpaint 20 20/07/2022 13:20 21000 2 24.6
#> 3 otherpaint 20 20/07/2022 14:30 22000 3 24.8

R - reduce multilevel list and add id variable at each step

Let's say that you have a list in R named Weather which is structured like this:
- Winter
  - Europe
    - Rainfall
    - Temperature
  - Asia
    - Rainfall
    - Temperature
  - Africa
    - Rainfall
    - Temperature
- Summer
  - Europe
    - Rainfall
    - Temperature
  - Asia
    - Rainfall
    - Temperature
  - Africa
    - Rainfall
    - Temperature
Each of the objects stored in Rainfall or Temperature is a data frame which contains two variables: Date and Temp in case of Temperature and Date and Rain in the case of Rainfall.
I want to turn this list of lists into a data frame which has the following variables: Date, Temp, Rain, Continent, Season.
I don't understand how I can automatically add the ID variables to the lower levels of the list, pulling the names from the list itself. For now I have to do it manually like this:
Weather$Winter$Europe$Rainfall$Continent <- "Europe"
Weather$Winter$Europe$Temperature$Continent <- "Europe"
Weather$Winter$Asia$Rainfall$Continent <- "Asia"
Weather$Winter$Asia$Temperature$Continent <- "Asia"
Weather$Winter$Africa$Rainfall$Continent <- "Africa"
Weather$Winter$Africa$Temperature$Continent <- "Africa"
Weather$Summer$Europe$Rainfall$Continent <- "Europe"
Weather$Summer$Europe$Temperature$Continent <- "Europe"
Weather$Summer$Asia$Rainfall$Continent <- "Asia"
Weather$Summer$Asia$Temperature$Continent <- "Asia"
Weather$Summer$Africa$Rainfall$Continent <- "Africa"
Weather$Summer$Africa$Temperature$Continent <- "Africa"
Then I use map_depth and reduce to join the lists. After that, I have to repeat this process to manually add the Season variable, and only then am I able to reduce all the data frames into a single one.
I am looking for a method that would allow me to automatically create the ID variables (Continent, Season), which are actually pulled from list names, while reducing the list.
You can use this as a starting point:
library(tidyverse)
data <- list(
  Winter = list(
    Europe = list(
      Rainfall = data.frame(Date = c("2022-06-01", "2022-06-02", "2022-06-03"), Rain = c(20, 10, 15)),
      Temperature = data.frame(Date = c("2022-06-01", "2022-06-02", "2022-06-03"), Temp = c(0, 0.5, 0.8))
    ),
    Asia = list(
      Rainfall = data.frame(Date = c("2022-06-01", "2022-06-02", "2022-06-03"), Rain = c(30, 35, 34)),
      Temperature = data.frame(Date = c("2022-06-01", "2022-06-02", "2022-06-03"), Temp = c(0, 0, 0.1))
    )
  )
)
data
#> $Winter
#> $Winter$Europe
#> $Winter$Europe$Rainfall
#> Date Rain
#> 1 2022-06-01 20
#> 2 2022-06-02 10
#> 3 2022-06-03 15
#>
#> $Winter$Europe$Temperature
#> Date Temp
#> 1 2022-06-01 0.0
#> 2 2022-06-02 0.5
#> 3 2022-06-03 0.8
#>
#>
#> $Winter$Asia
#> $Winter$Asia$Rainfall
#> Date Rain
#> 1 2022-06-01 30
#> 2 2022-06-02 35
#> 3 2022-06-03 34
#>
#> $Winter$Asia$Temperature
#> Date Temp
#> 1 2022-06-01 0.0
#> 2 2022-06-02 0.0
#> 3 2022-06-03 0.1
data %>%
rapply(as.list) %>%
enframe() %>%
separate(name, into = c("Season", "Continent", "table", "name"), sep = "[.]") %>%
mutate(name = name %>% str_remove("[0-9]+$")) %>%
select(-table) %>%
pivot_wider(values_fn = list) %>%
unnest(Date) %>%
unnest(Rain) %>%
unnest(Temp)
#> # A tibble: 108 × 5
#> Season Continent Date Rain Temp
#> <chr> <chr> <chr> <chr> <chr>
#> 1 Winter Europe 2022-06-01 20 0
#> 2 Winter Europe 2022-06-01 20 0.5
#> 3 Winter Europe 2022-06-01 20 0.8
#> 4 Winter Europe 2022-06-01 10 0
#> 5 Winter Europe 2022-06-01 10 0.5
#> 6 Winter Europe 2022-06-01 10 0.8
#> 7 Winter Europe 2022-06-01 15 0
#> 8 Winter Europe 2022-06-01 15 0.5
#> 9 Winter Europe 2022-06-01 15 0.8
#> 10 Winter Europe 2022-06-02 20 0
#> # … with 98 more rows
Created on 2022-06-28 by the reprex package (v2.0.0)
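Another possible approach (a sketch against the same sample data): walk the two list levels with purrr::imap_dfr(), which supplies each element's name, join Rainfall and Temperature on Date, and row-bind everything:
library(purrr)
library(dplyr)
weather_df <- imap_dfr(data, function(continents, season) {
  imap_dfr(continents, function(tables, continent) {
    # Join the two measurement tables for this Season/Continent by Date,
    # then tag the rows with the names pulled from the list structure.
    full_join(tables$Rainfall, tables$Temperature, by = "Date") %>%
      mutate(Season = season, Continent = continent)
  })
})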

Translating a `for loop` into purrr using `map` function in r

I need to download weather data from NASA's POWER (Prediction Of Worldwide Energy Resource). The nasapower package was developed for retrieving these data in R. I need to download data for many locations (lat/long coordinates). To do this I tried a simple loop with three locations as a reproducible example.
library(nasapower)
data1 <- read.csv(text = "
location,long,lat
loc1, -56.547, -14.2427
loc2, -57.547, -15.2427
loc3, -58.547, -16.2427")
i=1
all.weather <- data.frame()
for (i in seq_along(1:nrow(data1))) {
  weather.data <- get_power(community = "AG",
                            lonlat = c(data1$long[i], data1$lat[i]),
                            dates = c("2015-01-01", "2015-01-10"),
                            temporal_average = "DAILY",
                            pars = c("T2M_MAX"))
  all.weather <- rbind(all.weather, weather.data)
}
This works perfectly. The problem is that I am trying to mimic this using purrr::map, since I want an alternative within the tidyverse. This is what I did, but it does not work:
library(dplyr)
library(purrr)
all.weather <- data1 %>%
group_by(location) %>%
map(get_power(community = "AG",
lonlat = c(long, lat),
dates = c("2015-01-01", "2015-01-10"),
temporal_average = "DAILY",
site_elevation = NULL,
pars = c("T2M_MAX")))
I got the following error:
Error in isFALSE(length(lonlat != 2)) : object 'long' not found
Any hint on how to run this using purrr?
To make your code work, use purrr::pmap instead of map, like so:
map is for one-argument functions, map2 for two-argument functions, and pmap is the most general one, allowing functions with more than two arguments.
pmap loops over the rows of your data frame. As your data frame has 3 columns, 3 arguments are passed to the function, even if the first argument, location, is not used. To make this work and to make use of the column names, you have to spell out the function and its argument names via function(location, long, lat):
library(nasapower)
data1 <- read.csv(text = "
location,long,lat
loc1, -56.547, -14.2427
loc2, -57.547, -15.2427
loc3, -58.547, -16.2427")
library(dplyr)
library(purrr)
all.weather <- data1 %>%
  pmap(function(location, long, lat) get_power(community = "AG",
                                               lonlat = c(long, lat),
                                               dates = c("2015-01-01", "2015-01-10"),
                                               temporal_average = "DAILY",
                                               site_elevation = NULL,
                                               pars = c("T2M_MAX"))) %>%
  # Name list with locations
  setNames(data1$location) %>%
  # Add location names as identifiers
  bind_rows(.id = "location")
head(all.weather)
#> NASA/POWER SRB/FLASHFlux/MERRA2/GEOS 5.12.4 (FP-IT) 0.5 x 0.5 Degree Daily Averaged Data
#> Dates (month/day/year): 01/01/2015 through 01/10/2015
#> Location: Latitude -14.2427 Longitude -56.547
#> Elevation from MERRA-2: Average for 1/2x1/2 degree lat/lon region = 379.25 meters Site = na
#> Climate zone: na (reference Briggs et al: http://www.energycodes.gov)
#> Value for missing model data cannot be computed or out of model availability range: NA
#>
#> Parameters:
#> T2M_MAX MERRA2 1/2x1/2 Maximum Temperature at 2 Meters (C)
#>
#> # A tibble: 6 x 9
#> location LON LAT YEAR MM DD DOY YYYYMMDD T2M_MAX
#> <chr> <dbl> <dbl> <dbl> <int> <int> <int> <date> <dbl>
#> 1 loc1 -56.5 -14.2 2015 1 1 1 2015-01-01 29.9
#> 2 loc1 -56.5 -14.2 2015 1 2 2 2015-01-02 30.1
#> 3 loc1 -56.5 -14.2 2015 1 3 3 2015-01-03 27.3
#> 4 loc1 -56.5 -14.2 2015 1 4 4 2015-01-04 28.7
#> 5 loc1 -56.5 -14.2 2015 1 5 5 2015-01-05 30
#> 6 loc1 -56.5 -14.2 2015 1 6 6 2015-01-06 28.7
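A slightly more compact variant (a sketch, with the same assumptions about get_power() as above) is pmap_dfr(), which row-binds the results directly and lets you attach the location inside the function:
all.weather <- data1 %>%
  pmap_dfr(function(location, long, lat) {
    get_power(community = "AG",
              lonlat = c(long, lat),
              dates = c("2015-01-01", "2015-01-10"),
              temporal_average = "DAILY",
              pars = c("T2M_MAX")) %>%
      # keep track of which location each block of rows came from
      mutate(location = location)
  })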

Use mutate inside a function called by an apply family function

I am trying to change some of my data that are stored as tibbles inside a list.
This list of tibbles was generated by a package.
I do not understand why my function does not work.
If I extract a tibble element manually, the function works, but not inside lapply.
My function:
changesomethingtaxize <- function(x, whatchange = NULL, applyfunction = NULL) {
  library(lazyeval)
  mutate_call <- lazyeval::interp(~ a(b), a = match.fun(applyfunction), b = as.name(whatchange))
  x %<>% mutate_(.dots = setNames(list(mutate_call), whatchange))
  return(x)
}
I want to do
mydata <- lapply(mydata, function(x) changesomethingtaxize(x, whatchange=rank, applyfunction=str_to_sentence) )
I could use a loop to extract each tibble (in this case I only have 5), but I would like to understand what I am doing wrong :)
From dput():
mydata <- structure(list(`Zostera marina` = structure(list(name = c("Plantae",
"Viridiplantae", "Streptophyta", "Embryophyta", "Tracheophyta",
"Spermatophytina", "Magnoliopsida", "Lilianae", "Alismatales",
"Zosteraceae", "Zostera", "Zostera marina"), rank = c("kingdom",
"subkingdom", "infrakingdom", "superdivision", "division", "subdivision",
"class", "superorder", "order", "family", "genus", "species"),
id = c("202422", "954898", "846494", "954900", "846496",
"846504", "18063", "846542", "38883", "39069", "39073", "39074"
)), row.names = c(NA, 12L), class = "data.frame"), `Vascular plants` = structure(list(
name = c("Plantae", "Viridiplantae", "Streptophyta", "Embryophyta",
"Tracheophyta"), rank = c("kingdom", "subkingdom", "infrakingdom",
"superdivision", "division"), id = c("202422", "954898",
"846494", "954900", "846496")), row.names = c(NA, 5L), class = "data.frame"),
`Fucus vesiculosus` = structure(list(name = c("Chromista",
"Chromista", "Phaeophyta", "Phaeophyceae", "Fucales", "Fucaceae",
"Fucus", "Fucus vesiculosus"), rank = c("kingdom", "subkingdom",
"division", "class", "order", "family", "genus", "species"
), id = c("630578", "590735", "660055", "10686", "11328",
"11329", "11334", "11335")), row.names = c(NA, 8L), class = "data.frame"),
Macroalgae = NA, `Filamentous algae` = NA), class = "classification", db = "itis")
I think I actually found why... :D
The lapply works but was not returning anything because of the NAs (the empty elements of the list).
I added an if() that only mutates a tibble if the tibble actually contains something.
It is always an NA issue somewhere!
Well, I hope that piece of code can help someone someday.
The functions you provided aren't usable by themselves, but it looks like you're attempting to use a function meant to modify a data frame on non-dataframe objects, which mydata contains.
I'm using dplyr::mutate() just to illustrate here.
Your data contain NAs (which in this case are logical). dplyr::mutate() doesn't have a method for logicals, and I'm assuming the function you're trying to use doesn't either (or simply doesn't have a way of handling NA values).
You should be getting an error that's at least conceptually similar to the following ...
lapply(mydata, function(x) dplyr::mutate(x, col_to_modify = toupper(rank)))
#> Error in UseMethod("mutate_"): no applicable method for 'mutate_' applied to an object of class "logical"
To get around this, you can check your list ahead of time and note which elements are indeed data frames.
df_indices <- vapply(mydata, is.data.frame, logical(1L))
df_indices
#> Zostera marina Vascular plants Fucus vesiculosus Macroalgae
#> TRUE TRUE TRUE FALSE
#> Filamentous algae
#> FALSE
Using df_indices, we can modify only those elements in mydata like so...
mydata[df_indices] <- lapply(
mydata[df_indices],
function(x) dplyr::mutate(x, col_to_modify = toupper(rank))
)
mydata
#> $`Zostera marina`
#> name rank id col_to_modify
#> 1 Plantae kingdom 202422 KINGDOM
#> 2 Viridiplantae subkingdom 954898 SUBKINGDOM
#> 3 Streptophyta infrakingdom 846494 INFRAKINGDOM
#> 4 Embryophyta superdivision 954900 SUPERDIVISION
#> 5 Tracheophyta division 846496 DIVISION
#> 6 Spermatophytina subdivision 846504 SUBDIVISION
#> 7 Magnoliopsida class 18063 CLASS
#> 8 Lilianae superorder 846542 SUPERORDER
#> 9 Alismatales order 38883 ORDER
#> 10 Zosteraceae family 39069 FAMILY
#> 11 Zostera genus 39073 GENUS
#> 12 Zostera marina species 39074 SPECIES
#>
#> $`Vascular plants`
#> name rank id col_to_modify
#> 1 Plantae kingdom 202422 KINGDOM
#> 2 Viridiplantae subkingdom 954898 SUBKINGDOM
#> 3 Streptophyta infrakingdom 846494 INFRAKINGDOM
#> 4 Embryophyta superdivision 954900 SUPERDIVISION
#> 5 Tracheophyta division 846496 DIVISION
#>
#> $`Fucus vesiculosus`
#> name rank id col_to_modify
#> 1 Chromista kingdom 630578 KINGDOM
#> 2 Chromista subkingdom 590735 SUBKINGDOM
#> 3 Phaeophyta division 660055 DIVISION
#> 4 Phaeophyceae class 10686 CLASS
#> 5 Fucales order 11328 ORDER
#> 6 Fucaceae family 11329 FAMILY
#> 7 Fucus genus 11334 GENUS
#> 8 Fucus vesiculosus species 11335 SPECIES
#>
#> $Macroalgae
#> [1] NA
#>
#> $`Filamentous algae`
#> [1] NA
#>
#> attr(,"class")
#> [1] "classification"
#> attr(,"db")
#> [1] "itis"
Note that {purrr} has a nice map() variant designed to handle this very situation. purrr::map_if() takes a .p (predicate) argument, to which you supply a function; it is applied to each element of .x and returns TRUE or FALSE. Only the elements that return TRUE are modified by the function you supply to .f:
purrr::map_if(.x = mydata, .p = is.data.frame,
.f = ~ dplyr::mutate(.x, col_to_modify = toupper(rank)))
#> $`Zostera marina`
#> name rank id col_to_modify
#> 1 Plantae kingdom 202422 KINGDOM
#> 2 Viridiplantae subkingdom 954898 SUBKINGDOM
#> 3 Streptophyta infrakingdom 846494 INFRAKINGDOM
#> 4 Embryophyta superdivision 954900 SUPERDIVISION
#> 5 Tracheophyta division 846496 DIVISION
#> 6 Spermatophytina subdivision 846504 SUBDIVISION
#> 7 Magnoliopsida class 18063 CLASS
#> 8 Lilianae superorder 846542 SUPERORDER
#> 9 Alismatales order 38883 ORDER
#> 10 Zosteraceae family 39069 FAMILY
#> 11 Zostera genus 39073 GENUS
#> 12 Zostera marina species 39074 SPECIES
#>
#> $`Vascular plants`
#> name rank id col_to_modify
#> 1 Plantae kingdom 202422 KINGDOM
#> 2 Viridiplantae subkingdom 954898 SUBKINGDOM
#> 3 Streptophyta infrakingdom 846494 INFRAKINGDOM
#> 4 Embryophyta superdivision 954900 SUPERDIVISION
#> 5 Tracheophyta division 846496 DIVISION
#>
#> $`Fucus vesiculosus`
#> name rank id col_to_modify
#> 1 Chromista kingdom 630578 KINGDOM
#> 2 Chromista subkingdom 590735 SUBKINGDOM
#> 3 Phaeophyta division 660055 DIVISION
#> 4 Phaeophyceae class 10686 CLASS
#> 5 Fucales order 11328 ORDER
#> 6 Fucaceae family 11329 FAMILY
#> 7 Fucus genus 11334 GENUS
#> 8 Fucus vesiculosus species 11335 SPECIES
#>
#> $Macroalgae
#> [1] NA
#>
#> $`Filamentous algae`
#> [1] NA
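On the original helper itself: mutate_() and the lazyeval idiom are deprecated, so a possible modern rewrite (just a sketch, keeping the same arguments) uses dplyr's .data pronoun and glue-style naming, and pairs naturally with purrr::map_if():
library(dplyr)
library(stringr)
# Sketch: `whatchange` is the column name as a string, `applyfunction` a
# function (or a function name); the named column is overwritten in place.
changesomethingtaxize <- function(x, whatchange = NULL, applyfunction = NULL) {
  f <- match.fun(applyfunction)
  mutate(x, "{whatchange}" := f(.data[[whatchange]]))
}
# Only the data-frame elements are touched; the NA elements pass through.
mydata_clean <- purrr::map_if(mydata, is.data.frame,
                              ~ changesomethingtaxize(.x, "rank", str_to_sentence))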
