comparing the variables and their values between two data frames - r
I have two data frames with same kind of data, now i want to check for all the columns in both data frames have same kind of text in all columns in both data frames .
so for example the column name "sales executives" in both data frames have exact name "Micheal klay" in both data frames but if there is any spelling error or extra space i want to show it as not matching.
I have tried below approach and its working for small database but because my data is very big, data having approx 10 - 40 millions or records so its showing error
do we have any solution or any other approach to do that
cannot allocate vector of size 3.2GB
library(tidyverse)
df1 <- data.frame(MAN=c(6,6,4,6,8,6,8,4,4,6,6,8,8),MANi=c("OD","NY","CA","CA","OD","CA","OD","NY","OL","NY","OD","CA","OD"),
nune=c("akas","mani","juna","mau","nuh","kil","kman","nuha","huna","kman","nuha","huna","mani"),
klay=c(1,2,2,1,1,2,1,2,1,2,1,1,2),emial=c("dd","xyz","abc","dd","xyz","abc","dd","xyz","abc","dd","xyz","abc","dd"),Pass=c("Low","High","Low","Low","High","Low","High","High","Low","High","High","High","Low"),fri=c("KKK","USA","IND","SRI","PAK","CHI","JYP","TGA","KKK","USA","IND","SRI","PAK"),
mkl=c("m","f","m","m","f","m","m","f","m","m","f","m","m"),kin=c("Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Rec","Sent","Rec"),munc=c("Car","Bus","Truk","Cyl","Bus","Car","Bus","Bus","Bus","Car","Car","Cyl","Car"),
lone=c("Sr","jun","sr","jun","man","man","jr","Sr","jun","sr","jun","man","man"),wond=c("tko","kent","bho","kilt","kent","bho","kent","bho","bho","kilt","kent","bho","kilt"))
df2 <- data.frame(MAN=c(6,6,4,6,8,6,8,4,4,6,6,8,8,8,6),MANi=c("OD","NY","CA","CA","OD","CA","OD","NY","OL","ny","OD","CA","OD","NY","OL"),
nune=c("akas","mani","juna","mau","nuh","kil","kman","nuha","huna","kman","nuha","huna","mani","juna","mau"),
klay=c(1,2,2,1,1,2,1,2,1,2,1,1,2,2,1),emial=c("dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC"),Pass=c("Low","High","Low","Low","High","Low","High","High","Low","High","High","High","Low","High","High"),fri=c("KKK","USA","IND","SRI","PAK","CHI","JYP","TGA","KKK","USA","IND","SRI","PAK","CHI","JYP"),
mkl=c("male","female","male","male","female","male","male","female","male","male","female","male","male","female","male"),kin=c("Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Rec","Sent","Rec","Sent","Rec"),munc=c("Car","Bus","Truk","Cyl","Bus","Car","Bus","Bus","Bus","Car","Car","Cyl","Car","Bus","Bus"),
lone=c("Sr","jun","sr","jun","man","man","jr","Sr","jun","sr","jun","man","man","jr","man"),wond=c("tko","kent","bho","kilt","kent","bho","kent","bho","bho","kilt","kent","bho","kilt","kent","bho"))
df1_long <- df1 %>%
as_tibble() %>%
mutate_if(is.double, as.character) %>% distinct() %>%
pivot_longer(everything(), names_to = "Names", values_to = "options") %>%
arrange(Names, options)
df2_long <- df2 %>%
as_tibble() %>%
mutate_if(is.double, as.character) %>% distinct() %>%
pivot_longer(everything(), names_to = "Names", values_to = "options") %>%
arrange(Names, options)
T1 <- df1_long %>%
full_join(df2_long, by=c("Names", "options"), keep = TRUE) %>%
distinct(Names.x, options.x, Names.y, options.y) %>%
arrange(Names.x, Names.y, options.x, options.y) %>%
mutate(
consistant_names = !is.na(Names.x) & !is.na(Names.y),
consistant_options = !is.na(options.x) & !is.na(options.y)
)
the output required like below
below are inconsistency between data bases
Related
Merging datasets in R results in incomplete string values
I wanted to merge two datasets and the merging resulted in getting incomplete strings in a variable # Librerias ```{r} library(dplyr) library(tidyr) library(tidyverse) library(readxl) library(readr) library(base) library(stringr) library(foreign) library(forcats) library(fs) library(hablar) library(openxlsx) ``` # Comercio CIIU ```{r} # Mercancias Catalogo_comercio_CIIU <- read_excel("Comercio/Catalogo comercio bienes CIIU.xlsx") %>% mutate(ACTIV4=CIIU) CIIU_Exportaciones <- read_excel("Comercio/CIIU Exportaciones.xlsx") %>% pivot_longer(cols = -...1, names_to = "Period", values_to = "Valor") %>% mutate(Flujo="Exportaciones") CIIU_Importaciones <- read_excel("Comercio/CIIU Importaciones.xlsx") %>% pivot_longer(cols = -...1, names_to = "Period", values_to = "Valor") %>% mutate(Flujo="Importaciones") Length<- CIIU_Exportaciones %>% group_by(...1) %>% summarise(Obs=n()) Length<- Length$Obs[[1]] %>% as.numeric() %>% as.vector() # Mensual Comercio_bienes_mensual <- rbind(CIIU_Importaciones, CIIU_Exportaciones) %>% rename(Actividad="...1") %>% group_by(Flujo, Actividad) %>% mutate(Fecha=seq(from=as.Date("1994-01-01"), by="month", length.out=Length)) %>% mutate(Year=str_sub(Fecha, 1L,4L), Mes=str_sub(Fecha, 6L,7L)) %>% group_by(Flujo, Actividad, Year) %>% mutate(Acumulado=cumsum(Valor)) %>% group_by(Flujo, Actividad) %>% mutate( C_acumulado= Acumulado-lag(Acumulado, n=12L), TC_acumulado=Acumulado/lag(Acumulado, n=12L)-1 ) %>% merge(Catalogo_comercio_CIIU, by="Actividad") %>% select(-Grupo,-Detalle,-Categoria1,-Categoria2) ``` This is the catalogue I want to merge This is how the merging turned out (incomplete string) I am a beginner so I don't really know what to try
Loop to create crosstabs of columns using tidyr
I would like to use a loop to create crosstabs of one column with every other column in a df. I started with this code (substituting in the iris df), which works nicely for two variables: iris <- iris tbl <- iris %>% tabyl(Species, Sepal.Length, show_missing_levels = FALSE, show_na = FALSE) %>% adorn_percentages("row") %>% adorn_pct_formatting(digits = 0) %>% adorn_ns() %>% adorn_title("combined") %>% knitr::kable() print(tbl) My df contains ~200 columns. I thought I would write a for loop to print a crosstab for one variable with each of the other variables. Here's what I tried: cols <- c('Sepal.Length', 'Sepal.Width') for (c in cols){ tbl <- iris %>% tabyl(Species, c, show_missing_levels = FALSE, show_na = FALSE) %>% adorn_percentages("row") %>% adorn_pct_formatting(digits = 0) %>% adorn_ns() %>% adorn_title("combined") %>% knitr::kable() print(tbl) } This returns Column `c` is not found. This seems like it should be simple, but I can't figure it out. Thanks for any help.
Change the c in your code to !!sym(c). I can't explain this non-standard tidyverse evaluation thingy, but in layman's terms, you want to access an object (i.e. "c") outside of your pipe (iris). That's why you need !!sym.
You can use the .data pronoun when passing columns names as strings. cols <- c('Sepal.Length', 'Sepal.Width') for (col in cols){ tbl <- iris %>% tabyl(Species, .data[[col]],show_missing_levels = FALSE,show_na = FALSE) %>% adorn_percentages("row") %>% adorn_pct_formatting(digits = 0) %>% adorn_ns() %>% adorn_title("combined") %>% knitr::kable() print(tbl) }
Wide Format Summary in tidyverse
Hi I have a dataframe in wide format that is grouped by Site. Each column represents the abundance a different species(85 total). I am trying to summarize the dataframe to calculate the total number of individuals regardless of species in my data. df.totals<- df %>% group_by(Site) %>% summarize (total = sum(6:91))
We can gather to 'long' format and then do the sum library(tidyverse) df %>% select(Site, 6:91) %>% rownames_to_column("rn") %>% gather(key, val, - Site, -rn) %>% group_by(Site, rn) %>% summarise(total = sum(val)) or another option without gathering would be df %>% select(Site, 6:91) %>% transmute(vs, Sum = reduce(.[2:ncol(.)], `+`)) %>% group_by(Site) %>% summarise(Sum = sum(Sum)) Using a reproducible example with mtcars mtcars %>% select(vs, 4:6) %>% transmute(vs, Sum = reduce(.[2:ncol(.)], `+`)) %>% group_by(vs) %>% summarise(Sum = sum(Sum))
Mutating values of subset of columns into percentage format
I have generated this summary table based on the df below. set.seed(1) df <- data.frame(rep( sample(c(2012,2016),10, replace = T)), sample(c('Treat','Control'),10,replace = T), runif(10,0,1), runif(10,0,1), runif(10,0,1)) colnames(df) <- c('Year','Group','V1','V2','V3') summary.table = df %>% group_by(Year, Group) %>% group_by(N = n(), add = TRUE) %>% summarise_all(funs(sd,median)) %>% ungroup %>% mutate(Year = ifelse(duplicated(Year),"",Year)) Is there a way I could display the values related to the median columns as percentages? I did not know how to use mutate() and scales::percent() for only a subset of columns (I dont want to do it individually, since there will be more columns in the original dataset, making this procedure not practical enough. What should I have done instead if I wanted to mutate according to a subset of rows? Thank you EDIT: And if it was like this? summary.table = df %>% group_by(Year, Group) %>% summarise_all(funs(median,sd)) %>% gather(key, value, -Year, -Group) %>% separate(key, into=c("var", "stat")) %>% unite(stat_Group, stat, Group) %>% spread(stat_Group, value) %>% ungroup %>% mutate(Year = ifelse(duplicated(Year),"",Year))
We need to use the percent wrapped on median summary.table <- df %>% group_by(Year, Group) %>% group_by(N = n(), add = TRUE) %>% summarise_all(funs(sd=sd(.),median=scales::percent(median(.)))) %>% ungroup %>% mutate(Year = ifelse(duplicated(Year),"",Year))
Combine list of data frames with one column of characters
I am learning to get, cleaning and combining data. I am confused why in a loop rbind command result in returning 10 data instead of expected 30 data as when I combine it manually (i by i). library(XML) mergeal <- NULL tabnums <- 3 for (i in 1:length(tabnums)) { bnn <- paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=", tabnums[i], "&&coming=22-Oct-2015&coming=22-Oct-2015") tem <- readHTMLTable(bnn, header=T, stringsAsFactors=F) #data cleaning ff <- tem[8] #wanted data ff1 <- as.data.frame(ff) ff2 <- ff1[ , 1] #get 1st col data only ff3 <- unique(ff2) ff4 <- ff3[c(2,5:13)] #wanted list only #merging dataset mergeal <- rbind(mergeal, ff4) } I've tried using list rbind list of data frames with one column of characters and numerics but still have the same result as above. Appreciate any help on what I missed, thanks.
I cleaned up the data cause I was bored. library(plyr) library(XML) library(dplyr) library(magrittr) library(stringi) library(tidyr) library(lubridate) answer = data_frame(tabnums = 1:3) %>% group_by(tabnums) %>% do(.$tabnums %>% paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=", ., "&&coming=22-Oct-2015&coming=22-Oct-2015") %>% readHTMLTable(header = T, stringsAsFactors = F) %>% extract2(8)) %>% ungroup %>% select(V1) %>% distinct %>% mutate(V1 = V1 %>% stri_replace_all_fixed("Â", "\n") %>% stri_replace_all_fixed("Type:", "\nType:") %>% stri_replace_all_fixed("Time:", "\nTime:") %>% stri_replace_all_fixed("Area:", "\nArea:") %>% stri_split_fixed("\n")) %>% unnest(V1) %>% mutate(V1 = V1 %>% stri_trim) %>% filter(V1 %>% stri_detect_regex("^There are currently") %>% `!`) %>% filter(V1 != "") %>% separate(V1, c("variable", "value"), sep = ":", fill = "left") %>% mutate(variable = variable %>% mapvalues(NA, "Description"), ID = variable %>% `==`("Description") %>% cumsum) %>% spread(variable, value) %>% mutate(Area = Area %>% extract_numeric, Price = Price %>% extract_numeric, Datetime = Time %>% stri_replace_all_fixed("a.m.", "am") %>% stri_replace_all_fixed("p.m.", "pm") %>% paste(Date, .) %>% dmy_hm) %>% select(-Date, -Time)