I am validating two data frames to check whether they are consistent. It works perfectly on small data frames, but when the number of records in the data frames increases it shows an error.
library(tidyverse)
# Reference data frame: 13 rows, 12 columns.
df1 <- data.frame(
  MAN = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8),
  MANi = c(
    "OD", "NY", "CA", "CA", "OD", "CA", "OD",
    "NY", "OL", "NY", "OD", "CA", "OD"
  ),
  nune = c(
    "akas", "mani", "juna", "mau", "nuh", "kil", "kman",
    "nuha", "huna", "kman", "nuha", "huna", "mani"
  ),
  klay = c(1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2),
  emial = c(
    "dd", "xyz", "abc", "dd", "xyz", "abc", "dd",
    "xyz", "abc", "dd", "xyz", "abc", "dd"
  ),
  Pass = c(
    "Low", "High", "Low", "Low", "High", "Low", "High",
    "High", "Low", "High", "High", "High", "Low"
  ),
  fri = c(
    "KKK", "USA", "IND", "SRI", "PAK", "CHI", "JYP",
    "TGA", "KKK", "USA", "IND", "SRI", "PAK"
  ),
  mkl = c("m", "f", "m", "m", "f", "m", "m", "f", "m", "m", "f", "m", "m"),
  kin = c(
    "Sent", "Rec", "Sent", "Rec", "Sent", "Rec", "Sent",
    "Rec", "Sent", "Rec", "Rec", "Sent", "Rec"
  ),
  munc = c(
    "Car", "Bus", "Truk", "Cyl", "Bus", "Car", "Bus",
    "Bus", "Bus", "Car", "Car", "Cyl", "Car"
  ),
  lone = c(
    "Sr", "jun", "sr", "jun", "man", "man", "jr",
    "Sr", "jun", "sr", "jun", "man", "man"
  ),
  wond = c(
    "tko", "kent", "bho", "kilt", "kent", "bho", "kent",
    "bho", "bho", "kilt", "kent", "bho", "kilt"
  )
)

# Data frame to validate against df1: same 12 columns, 15 rows. Within the
# first 13 rows it differs from df1 in MANi (row 10 is "ny"), emial ("ABC"
# instead of "abc") and mkl ("male"/"female" instead of "m"/"f").
df2 <- data.frame(
  MAN = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 6),
  MANi = c(
    "OD", "NY", "CA", "CA", "OD", "CA", "OD", "NY",
    "OL", "ny", "OD", "CA", "OD", "NY", "OL"
  ),
  nune = c(
    "akas", "mani", "juna", "mau", "nuh", "kil", "kman", "nuha",
    "huna", "kman", "nuha", "huna", "mani", "juna", "mau"
  ),
  klay = c(1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1),
  emial = c(
    "dd", "xyz", "ABC", "dd", "xyz", "ABC", "dd", "xyz",
    "ABC", "dd", "xyz", "ABC", "dd", "xyz", "ABC"
  ),
  Pass = c(
    "Low", "High", "Low", "Low", "High", "Low", "High", "High",
    "Low", "High", "High", "High", "Low", "High", "High"
  ),
  fri = c(
    "KKK", "USA", "IND", "SRI", "PAK", "CHI", "JYP", "TGA",
    "KKK", "USA", "IND", "SRI", "PAK", "CHI", "JYP"
  ),
  mkl = c(
    "male", "female", "male", "male", "female", "male", "male", "female",
    "male", "male", "female", "male", "male", "female", "male"
  ),
  kin = c(
    "Sent", "Rec", "Sent", "Rec", "Sent", "Rec", "Sent", "Rec",
    "Sent", "Rec", "Rec", "Sent", "Rec", "Sent", "Rec"
  ),
  munc = c(
    "Car", "Bus", "Truk", "Cyl", "Bus", "Car", "Bus", "Bus",
    "Bus", "Car", "Car", "Cyl", "Car", "Bus", "Bus"
  ),
  lone = c(
    "Sr", "jun", "sr", "jun", "man", "man", "jr", "Sr",
    "jun", "sr", "jun", "man", "man", "jr", "man"
  ),
  wond = c(
    "tko", "kent", "bho", "kilt", "kent", "bho", "kent", "bho",
    "bho", "kilt", "kent", "bho", "kilt", "kent", "bho"
  )
)
Worth considering waldo::compare?
# Reference data frame: 13 rows, 12 columns.
df1 <- data.frame(
  MAN = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8),
  MANi = c(
    "OD", "NY", "CA", "CA", "OD", "CA", "OD",
    "NY", "OL", "NY", "OD", "CA", "OD"
  ),
  nune = c(
    "akas", "mani", "juna", "mau", "nuh", "kil", "kman",
    "nuha", "huna", "kman", "nuha", "huna", "mani"
  ),
  klay = c(1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2),
  emial = c(
    "dd", "xyz", "abc", "dd", "xyz", "abc", "dd",
    "xyz", "abc", "dd", "xyz", "abc", "dd"
  ),
  Pass = c(
    "Low", "High", "Low", "Low", "High", "Low", "High",
    "High", "Low", "High", "High", "High", "Low"
  ),
  fri = c(
    "KKK", "USA", "IND", "SRI", "PAK", "CHI", "JYP",
    "TGA", "KKK", "USA", "IND", "SRI", "PAK"
  ),
  mkl = c("m", "f", "m", "m", "f", "m", "m", "f", "m", "m", "f", "m", "m"),
  kin = c(
    "Sent", "Rec", "Sent", "Rec", "Sent", "Rec", "Sent",
    "Rec", "Sent", "Rec", "Rec", "Sent", "Rec"
  ),
  munc = c(
    "Car", "Bus", "Truk", "Cyl", "Bus", "Car", "Bus",
    "Bus", "Bus", "Car", "Car", "Cyl", "Car"
  ),
  lone = c(
    "Sr", "jun", "sr", "jun", "man", "man", "jr",
    "Sr", "jun", "sr", "jun", "man", "man"
  ),
  wond = c(
    "tko", "kent", "bho", "kilt", "kent", "bho", "kent",
    "bho", "bho", "kilt", "kent", "bho", "kilt"
  )
)

# Data frame compared against df1: same 12 columns, 15 rows.
df2 <- data.frame(
  MAN = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 6),
  MANi = c(
    "OD", "NY", "CA", "CA", "OD", "CA", "OD", "NY",
    "OL", "ny", "OD", "CA", "OD", "NY", "OL"
  ),
  nune = c(
    "akas", "mani", "juna", "mau", "nuh", "kil", "kman", "nuha",
    "huna", "kman", "nuha", "huna", "mani", "juna", "mau"
  ),
  klay = c(1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1),
  emial = c(
    "dd", "xyz", "ABC", "dd", "xyz", "ABC", "dd", "xyz",
    "ABC", "dd", "xyz", "ABC", "dd", "xyz", "ABC"
  ),
  Pass = c(
    "Low", "High", "Low", "Low", "High", "Low", "High", "High",
    "Low", "High", "High", "High", "Low", "High", "High"
  ),
  fri = c(
    "KKK", "USA", "IND", "SRI", "PAK", "CHI", "JYP", "TGA",
    "KKK", "USA", "IND", "SRI", "PAK", "CHI", "JYP"
  ),
  mkl = c(
    "male", "female", "male", "male", "female", "male", "male", "female",
    "male", "male", "female", "male", "male", "female", "male"
  ),
  kin = c(
    "Sent", "Rec", "Sent", "Rec", "Sent", "Rec", "Sent", "Rec",
    "Sent", "Rec", "Rec", "Sent", "Rec", "Sent", "Rec"
  ),
  munc = c(
    "Car", "Bus", "Truk", "Cyl", "Bus", "Car", "Bus", "Bus",
    "Bus", "Car", "Car", "Cyl", "Car", "Bus", "Bus"
  ),
  lone = c(
    "Sr", "jun", "sr", "jun", "man", "man", "jr", "Sr",
    "jun", "sr", "jun", "man", "man", "jr", "man"
  ),
  wond = c(
    "tko", "kent", "bho", "kilt", "kent", "bho", "kent", "bho",
    "bho", "kilt", "kent", "bho", "kilt", "kent", "bho"
  )
)
# Report every difference between the two data frames; df1 is shown as "old"
# and df2 as "new" in the printed comparison below.
waldo::compare(x = df1, y = df2)
#> `attr(old, 'row.names')[11:13]`: 11 12 13
#> `attr(new, 'row.names')[11:15]`: 11 12 13 14 15
#>
#> old vs new
#> MAN MANi nune klay emial Pass fri mkl kin munc lone wond
#> - old[1, ] 6 OD akas 1 dd Low KKK m Sent Car Sr tko
#> + new[1, ] 6 OD akas 1 dd Low KKK male Sent Car Sr tko
#> - old[2, ] 6 NY mani 2 xyz High USA f Rec Bus jun kent
#> + new[2, ] 6 NY mani 2 xyz High USA female Rec Bus jun kent
#> - old[3, ] 4 CA juna 2 abc Low IND m Sent Truk sr bho
#> + new[3, ] 4 CA juna 2 ABC Low IND male Sent Truk sr bho
#> - old[4, ] 6 CA mau 1 dd Low SRI m Rec Cyl jun kilt
#> + new[4, ] 6 CA mau 1 dd Low SRI male Rec Cyl jun kilt
#> - old[5, ] 8 OD nuh 1 xyz High PAK f Sent Bus man kent
#> + new[5, ] 8 OD nuh 1 xyz High PAK female Sent Bus man kent
#> - old[6, ] 6 CA kil 2 abc Low CHI m Rec Car man bho
#> + new[6, ] 6 CA kil 2 ABC Low CHI male Rec Car man bho
#> - old[7, ] 8 OD kman 1 dd High JYP m Sent Bus jr kent
#> + new[7, ] 8 OD kman 1 dd High JYP male Sent Bus jr kent
#> - old[8, ] 4 NY nuha 2 xyz High TGA f Rec Bus Sr bho
#> + new[8, ] 4 NY nuha 2 xyz High TGA female Rec Bus Sr bho
#> - old[9, ] 4 OL huna 1 abc Low KKK m Sent Bus jun bho
#> + new[9, ] 4 OL huna 1 ABC Low KKK male Sent Bus jun bho
#> - old[10, ] 6 NY kman 2 dd High USA m Rec Car sr kilt
#> + new[10, ] 6 ny kman 2 dd High USA male Rec Car sr kilt
#> and 5 more ...
#>
#> `old$MAN[11:13]`: 6 8 8
#> `new$MAN[11:15]`: 6 8 8 8 6
#>
#> `old$MANi[10:13]`: "NY" "OD" "CA" "OD"
#> `new$MANi[7:15]`: "OD" "NY" "OL" "ny" "OD" "CA" "OD" "NY" "OL"
#>
#> `old$nune[11:13]`: "nuha" "huna" "mani"
#> `new$nune[11:15]`: "nuha" "huna" "mani" "juna" "mau"
#>
#> `old$klay[11:13]`: 1 1 2
#> `new$klay[11:15]`: 1 1 2 2 1
#>
#> old$emial | new$emial
#> [2] "xyz" - "dd" [1]
#> [3] "abc" - "xyz" [2]
#> [4] "dd" - "ABC" [3]
#> [5] "xyz" - "dd" [4]
#> [6] "abc" - "xyz" [5]
#> [7] "dd" - "ABC" [6]
#> [8] "xyz" - "dd" [7]
#> [9] "abc" - "xyz" [8]
#> [10] "dd" - "ABC" [9]
#> [11] "xyz" - "dd" [10]
#> ... ... ... and 5 more ...
#>
#> `old$Pass[11:13]`: "High" "High" "Low"
#> `new$Pass[11:15]`: "High" "High" "Low" "High" "High"
#>
#> `old$fri[11:13]`: "IND" "SRI" "PAK"
#> `new$fri[11:15]`: "IND" "SRI" "PAK" "CHI" "JYP"
#>
#> old$mkl | new$mkl
#> [1] "m" - "male" [1]
#> [2] "f" - "female" [2]
#> [3] "m" - "male" [3]
#> [4] "m" - "male" [4]
#> [5] "f" - "female" [5]
#> [6] "m" - "male" [6]
#> [7] "m" - "male" [7]
#> [8] "f" - "female" [8]
#> [9] "m" - "male" [9]
#> [10] "m" - "male" [10]
#> ... ... ... and 5 more ...
#>
#> And 4 more differences ...
Created on 2022-05-21 by the reprex package (v2.0.1)
Or the daff package for highlighted sortable / filterable differences:
library(daff)
# Compute the cell-level diff with df1 as the reference table, then render it.
diffs <- diff_data(data_ref = df1, data = df2)
render_diff(diffs)
I am trying to change some of my data that are stored as tibbles inside a list.
This list of tibbles was generated by a package.
I do not understand why my function does not work.
If I extract a tibble element manually, the function works but not inside a lapply.
my function:
#' Apply a function to one column of a data frame
#'
#' Replaces the column named `whatchange` with the result of calling
#' `applyfunction` on it. Written with base R only, so it needs neither
#' lazyeval, magrittr's `%<>%`, nor the deprecated `dplyr::mutate_()`.
#'
#' @param x A data frame or tibble. Any other object (for example the `NA`
#'   placeholders that sit next to the data frames in the list this function
#'   is mapped over) is returned unchanged.
#' @param whatchange Name of the column to transform, as a single string
#'   (a symbol is also accepted).
#' @param applyfunction A function, or the name of one, taking the column
#'   vector and returning its replacement.
#' @return `x` with the column `whatchange` replaced; `x` itself when it is
#'   not a data frame.
changesomethingtaxize <- function(x, whatchange = NULL, applyfunction = NULL) {
  # Empty list slots are not data frames: pass them through instead of failing.
  if (!is.data.frame(x)) {
    return(x)
  }

  if (is.name(whatchange)) {
    whatchange <- as.character(whatchange)
  }
  if (!is.character(whatchange) || length(whatchange) != 1L) {
    stop("`whatchange` must be a single column name given as a string.",
      call. = FALSE
    )
  }
  if (!whatchange %in% names(x)) {
    stop("Column `", whatchange, "` not found in `x`.", call. = FALSE)
  }

  transform_column <- match.fun(applyfunction)
  x[[whatchange]] <- transform_column(x[[whatchange]])
  x
}
I want to do
# The column name must be passed as a string: a bare `rank` resolves to the
# base function rank(), which cannot be turned into a column name.
# Elements that are not data frames (the NA placeholders) are kept as they are.
mydata <- lapply(mydata, function(x) {
  if (!is.data.frame(x)) {
    return(x)
  }
  changesomethingtaxize(
    x,
    whatchange = "rank",
    applyfunction = str_to_sentence
  )
})
I could use a loop to extract each tibble (in this case I only have 5), but I would like to understand what I am doing wrong :)
From dput()
# Example input, as printed by dput(): a named list with class
# "classification" and attribute db = "itis".
# - `Zostera marina`, `Vascular plants` and `Fucus vesiculosus` are plain
#   data frames (class "data.frame") with character columns name, rank and id.
# - `Macroalgae` and `Filamentous algae` are NA, not data frames.
mydata <- structure(list(`Zostera marina` = structure(list(name = c("Plantae",
"Viridiplantae", "Streptophyta", "Embryophyta", "Tracheophyta",
"Spermatophytina", "Magnoliopsida", "Lilianae", "Alismatales",
"Zosteraceae", "Zostera", "Zostera marina"), rank = c("kingdom",
"subkingdom", "infrakingdom", "superdivision", "division", "subdivision",
"class", "superorder", "order", "family", "genus", "species"),
id = c("202422", "954898", "846494", "954900", "846496",
"846504", "18063", "846542", "38883", "39069", "39073", "39074"
)), row.names = c(NA, 12L), class = "data.frame"), `Vascular plants` = structure(list(
name = c("Plantae", "Viridiplantae", "Streptophyta", "Embryophyta",
"Tracheophyta"), rank = c("kingdom", "subkingdom", "infrakingdom",
"superdivision", "division"), id = c("202422", "954898",
"846494", "954900", "846496")), row.names = c(NA, 5L), class = "data.frame"),
`Fucus vesiculosus` = structure(list(name = c("Chromista",
"Chromista", "Phaeophyta", "Phaeophyceae", "Fucales", "Fucaceae",
"Fucus", "Fucus vesiculosus"), rank = c("kingdom", "subkingdom",
"division", "class", "order", "family", "genus", "species"
), id = c("630578", "590735", "660055", "10686", "11328",
"11329", "11334", "11335")), row.names = c(NA, 8L), class = "data.frame"),
Macroalgae = NA, `Filamentous algae` = NA), class = "classification", db = "itis")
I think I actually found why... :D
The lapply works but was not returning anything because of the NAs (empty elements of the list).
I added an if() that only mutates a tibble if the tibble actually contains something.
It is always an NA issue somewhere!
Well hope that piece of code could help someone someday.
The functions you provided aren't usable by themselves, but it looks like you're attempting to use a function meant to modify a data frame on non-dataframe objects, which mydata contains.
I'm using dplyr::mutate() just to illustrate here.
Your data contain NAs (which in this case are logical). dplyr::mutate() doesn't have a method for logicals, and I'm assuming the function you're trying to use doesn't either (or simply doesn't have a way of handling NA values).
You should be getting an error that's at least conceptually similar to the following ...
# Demonstration of the failure: the list also holds NA elements, and
# dplyr::mutate() has no method for them, so this call stops with an error.
lapply(
  X = mydata,
  FUN = function(elem) dplyr::mutate(elem, col_to_modify = toupper(rank))
)
#> Error in UseMethod("mutate_"): no applicable method for 'mutate_' applied to an object of class "logical"
To get around this, you can check your list ahead of time and note which elements are indeed data frames.
# One logical flag per list element: TRUE where the element is a data frame.
df_indices <- vapply(
  X = mydata,
  FUN = is.data.frame,
  FUN.VALUE = logical(1L)
)
df_indices
#> Zostera marina Vascular plants Fucus vesiculosus Macroalgae
#> TRUE TRUE TRUE FALSE
#> Filamentous algae
#> FALSE
Using df_indices, we can modify only those elements in mydata like so...
# Rewrite only the elements flagged as data frames; the remaining elements
# and the list's attributes are left as they were.
mydata[df_indices] <- lapply(
  X = mydata[df_indices],
  FUN = function(tbl) {
    dplyr::mutate(tbl, col_to_modify = toupper(rank))
  }
)
mydata
#> $`Zostera marina`
#> name rank id col_to_modify
#> 1 Plantae kingdom 202422 KINGDOM
#> 2 Viridiplantae subkingdom 954898 SUBKINGDOM
#> 3 Streptophyta infrakingdom 846494 INFRAKINGDOM
#> 4 Embryophyta superdivision 954900 SUPERDIVISION
#> 5 Tracheophyta division 846496 DIVISION
#> 6 Spermatophytina subdivision 846504 SUBDIVISION
#> 7 Magnoliopsida class 18063 CLASS
#> 8 Lilianae superorder 846542 SUPERORDER
#> 9 Alismatales order 38883 ORDER
#> 10 Zosteraceae family 39069 FAMILY
#> 11 Zostera genus 39073 GENUS
#> 12 Zostera marina species 39074 SPECIES
#>
#> $`Vascular plants`
#> name rank id col_to_modify
#> 1 Plantae kingdom 202422 KINGDOM
#> 2 Viridiplantae subkingdom 954898 SUBKINGDOM
#> 3 Streptophyta infrakingdom 846494 INFRAKINGDOM
#> 4 Embryophyta superdivision 954900 SUPERDIVISION
#> 5 Tracheophyta division 846496 DIVISION
#>
#> $`Fucus vesiculosus`
#> name rank id col_to_modify
#> 1 Chromista kingdom 630578 KINGDOM
#> 2 Chromista subkingdom 590735 SUBKINGDOM
#> 3 Phaeophyta division 660055 DIVISION
#> 4 Phaeophyceae class 10686 CLASS
#> 5 Fucales order 11328 ORDER
#> 6 Fucaceae family 11329 FAMILY
#> 7 Fucus genus 11334 GENUS
#> 8 Fucus vesiculosus species 11335 SPECIES
#>
#> $Macroalgae
#> [1] NA
#>
#> $`Filamentous algae`
#> [1] NA
#>
#> attr(,"class")
#> [1] "classification"
#> attr(,"db")
#> [1] "itis"
Note that {purrr} has a nice map() variant designed to handle this very situation. purrr::map_if() takes a .p (predicate) argument to which you can provide a function that it applies to .x and returns TRUE or FALSE. Only those elements that return TRUE are modified by the function you provide to .f
# Apply the mutate step only to elements for which is.data.frame() is TRUE;
# all other elements are returned unchanged.
purrr::map_if(
  .x = mydata,
  .p = is.data.frame,
  .f = function(tbl) dplyr::mutate(tbl, col_to_modify = toupper(rank))
)
#> $`Zostera marina`
#> name rank id col_to_modify
#> 1 Plantae kingdom 202422 KINGDOM
#> 2 Viridiplantae subkingdom 954898 SUBKINGDOM
#> 3 Streptophyta infrakingdom 846494 INFRAKINGDOM
#> 4 Embryophyta superdivision 954900 SUPERDIVISION
#> 5 Tracheophyta division 846496 DIVISION
#> 6 Spermatophytina subdivision 846504 SUBDIVISION
#> 7 Magnoliopsida class 18063 CLASS
#> 8 Lilianae superorder 846542 SUPERORDER
#> 9 Alismatales order 38883 ORDER
#> 10 Zosteraceae family 39069 FAMILY
#> 11 Zostera genus 39073 GENUS
#> 12 Zostera marina species 39074 SPECIES
#>
#> $`Vascular plants`
#> name rank id col_to_modify
#> 1 Plantae kingdom 202422 KINGDOM
#> 2 Viridiplantae subkingdom 954898 SUBKINGDOM
#> 3 Streptophyta infrakingdom 846494 INFRAKINGDOM
#> 4 Embryophyta superdivision 954900 SUPERDIVISION
#> 5 Tracheophyta division 846496 DIVISION
#>
#> $`Fucus vesiculosus`
#> name rank id col_to_modify
#> 1 Chromista kingdom 630578 KINGDOM
#> 2 Chromista subkingdom 590735 SUBKINGDOM
#> 3 Phaeophyta division 660055 DIVISION
#> 4 Phaeophyceae class 10686 CLASS
#> 5 Fucales order 11328 ORDER
#> 6 Fucaceae family 11329 FAMILY
#> 7 Fucus genus 11334 GENUS
#> 8 Fucus vesiculosus species 11335 SPECIES
#>
#> $Macroalgae
#> [1] NA
#>
#> $`Filamentous algae`
#> [1] NA