comparing the variables and their values between two data frames

comparing the variables and their values between two data frames - r

I have two data frames with same kind of data, now i want to check for all the columns in both data frames have same kind of text in all columns in both data frames .
so for example the column name "sales executives" in both data frames have exact name "Micheal klay" in both data frames but if there is any spelling error or extra space i want to show it as not matching.
I have tried below approach and its working for small database but because my data is very big, data having approx 10 - 40 millions or records so its showing error
do we have any solution or any other approach to do that
cannot allocate vector of size 3.2GB
library(tidyverse)
df1 <- data.frame(MAN=c(6,6,4,6,8,6,8,4,4,6,6,8,8),MANi=c("OD","NY","CA","CA","OD","CA","OD","NY","OL","NY","OD","CA","OD"),
nune=c("akas","mani","juna","mau","nuh","kil","kman","nuha","huna","kman","nuha","huna","mani"),
klay=c(1,2,2,1,1,2,1,2,1,2,1,1,2),emial=c("dd","xyz","abc","dd","xyz","abc","dd","xyz","abc","dd","xyz","abc","dd"),Pass=c("Low","High","Low","Low","High","Low","High","High","Low","High","High","High","Low"),fri=c("KKK","USA","IND","SRI","PAK","CHI","JYP","TGA","KKK","USA","IND","SRI","PAK"),
mkl=c("m","f","m","m","f","m","m","f","m","m","f","m","m"),kin=c("Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Rec","Sent","Rec"),munc=c("Car","Bus","Truk","Cyl","Bus","Car","Bus","Bus","Bus","Car","Car","Cyl","Car"),
lone=c("Sr","jun","sr","jun","man","man","jr","Sr","jun","sr","jun","man","man"),wond=c("tko","kent","bho","kilt","kent","bho","kent","bho","bho","kilt","kent","bho","kilt"))
df2 <- data.frame(MAN=c(6,6,4,6,8,6,8,4,4,6,6,8,8,8,6),MANi=c("OD","NY","CA","CA","OD","CA","OD","NY","OL","ny","OD","CA","OD","NY","OL"),
nune=c("akas","mani","juna","mau","nuh","kil","kman","nuha","huna","kman","nuha","huna","mani","juna","mau"),
klay=c(1,2,2,1,1,2,1,2,1,2,1,1,2,2,1),emial=c("dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC"),Pass=c("Low","High","Low","Low","High","Low","High","High","Low","High","High","High","Low","High","High"),fri=c("KKK","USA","IND","SRI","PAK","CHI","JYP","TGA","KKK","USA","IND","SRI","PAK","CHI","JYP"),
mkl=c("male","female","male","male","female","male","male","female","male","male","female","male","male","female","male"),kin=c("Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Rec","Sent","Rec","Sent","Rec"),munc=c("Car","Bus","Truk","Cyl","Bus","Car","Bus","Bus","Bus","Car","Car","Cyl","Car","Bus","Bus"),
lone=c("Sr","jun","sr","jun","man","man","jr","Sr","jun","sr","jun","man","man","jr","man"),wond=c("tko","kent","bho","kilt","kent","bho","kent","bho","bho","kilt","kent","bho","kilt","kent","bho"))
df1_long <- df1 %>%
as_tibble() %>%
mutate_if(is.double, as.character) %>% distinct() %>%
pivot_longer(everything(), names_to = "Names", values_to = "options") %>%
arrange(Names, options)
df2_long <- df2 %>%
as_tibble() %>%
mutate_if(is.double, as.character) %>% distinct() %>%
pivot_longer(everything(), names_to = "Names", values_to = "options") %>%
arrange(Names, options)
T1 <- df1_long %>%
full_join(df2_long, by=c("Names", "options"), keep = TRUE) %>%
distinct(Names.x, options.x, Names.y, options.y) %>%
arrange(Names.x, Names.y, options.x, options.y) %>%
mutate(
consistant_names = !is.na(Names.x) & !is.na(Names.y),
consistant_options = !is.na(options.x) & !is.na(options.y)
)
the output required like below
below are inconsistency between data bases

Related

Merging datasets in R results in incomplete string values

I wanted to merge two datasets and the merging resulted in getting incomplete strings in a variable
# Librerias
```{r}
library(dplyr)
library(tidyr)
library(tidyverse)
library(readxl)
library(readr)
library(base)
library(stringr)
library(foreign)
library(forcats)
library(fs)
library(hablar)
library(openxlsx)
```
# Comercio CIIU
```{r}
# Mercancias
Catalogo_comercio_CIIU <- read_excel("Comercio/Catalogo comercio bienes CIIU.xlsx") %>%
mutate(ACTIV4=CIIU)
CIIU_Exportaciones <- read_excel("Comercio/CIIU Exportaciones.xlsx") %>%
pivot_longer(cols = -...1, names_to = "Period", values_to = "Valor") %>%
mutate(Flujo="Exportaciones")
CIIU_Importaciones <- read_excel("Comercio/CIIU Importaciones.xlsx") %>%
pivot_longer(cols = -...1, names_to = "Period", values_to = "Valor") %>%
mutate(Flujo="Importaciones")
Length<- CIIU_Exportaciones %>% group_by(...1) %>%
summarise(Obs=n())
Length<- Length$Obs[[1]] %>%
as.numeric() %>%
as.vector()
# Mensual
Comercio_bienes_mensual <- rbind(CIIU_Importaciones, CIIU_Exportaciones) %>%
rename(Actividad="...1") %>% group_by(Flujo, Actividad) %>%
mutate(Fecha=seq(from=as.Date("1994-01-01"), by="month", length.out=Length)) %>%
mutate(Year=str_sub(Fecha, 1L,4L), Mes=str_sub(Fecha, 6L,7L)) %>%
group_by(Flujo, Actividad, Year) %>%
mutate(Acumulado=cumsum(Valor)) %>%
group_by(Flujo, Actividad) %>%
mutate( C_acumulado= Acumulado-lag(Acumulado, n=12L), TC_acumulado=Acumulado/lag(Acumulado, n=12L)-1 ) %>%
merge(Catalogo_comercio_CIIU, by="Actividad") %>%
select(-Grupo,-Detalle,-Categoria1,-Categoria2)
```
This is the catalogue I want to merge
This is how the merging turned out (incomplete string)
I am a beginner so I don't really know what to try

Loop to create crosstabs of columns using tidyr

I would like to use a loop to create crosstabs of one column with every other column in a df. I started with this code (substituting in the iris df), which works nicely for two variables:
iris <- iris
tbl <- iris %>%
tabyl(Species, Sepal.Length, show_missing_levels = FALSE, show_na = FALSE) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>%
adorn_title("combined") %>%
knitr::kable()
print(tbl)
My df contains ~200 columns. I thought I would write a for loop to print a crosstab for one variable with each of the other variables. Here's what I tried:
cols <- c('Sepal.Length', 'Sepal.Width')
for (c in cols){
tbl <- iris %>%
tabyl(Species, c, show_missing_levels = FALSE, show_na = FALSE) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>%
adorn_title("combined") %>%
knitr::kable()
print(tbl)
}
This returns Column `c` is not found.
This seems like it should be simple, but I can't figure it out. Thanks for any help.

Change the c in your code to !!sym(c). I can't explain this non-standard tidyverse evaluation thingy, but in layman's terms, you want to access an object (i.e. "c") outside of your pipe (iris). That's why you need !!sym.

You can use the .data pronoun when passing columns names as strings.
cols <- c('Sepal.Length', 'Sepal.Width')
for (col in cols){
tbl <- iris %>%
tabyl(Species, .data[[col]],show_missing_levels = FALSE,show_na = FALSE) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>%
adorn_title("combined") %>%
knitr::kable()
print(tbl)
}

Wide Format Summary in tidyverse

Hi I have a dataframe in wide format that is grouped by Site. Each column represents the abundance a different species(85 total). I am trying to summarize the dataframe to calculate the total number of individuals regardless of species in my data.
df.totals<- df %>% group_by(Site) %>% summarize (total = sum(6:91))

We can gather to 'long' format and then do the sum
library(tidyverse)
df %>%
select(Site, 6:91) %>%
rownames_to_column("rn") %>%
gather(key, val, - Site, -rn) %>%
group_by(Site, rn) %>%
summarise(total = sum(val))
or another option without gathering would be
df %>%
select(Site, 6:91) %>%
transmute(vs, Sum = reduce(.[2:ncol(.)], `+`)) %>%
group_by(Site) %>%
summarise(Sum = sum(Sum))
Using a reproducible example with mtcars
mtcars %>%
select(vs, 4:6) %>%
transmute(vs, Sum = reduce(.[2:ncol(.)], `+`)) %>%
group_by(vs) %>%
summarise(Sum = sum(Sum))

Mutating values of subset of columns into percentage format

I have generated this summary table based on the df below.
set.seed(1)
df <- data.frame(rep(
sample(c(2012,2016),10, replace = T)),
sample(c('Treat','Control'),10,replace = T),
runif(10,0,1),
runif(10,0,1),
runif(10,0,1))
colnames(df) <- c('Year','Group','V1','V2','V3')
summary.table = df %>%
group_by(Year, Group) %>%
group_by(N = n(), add = TRUE) %>%
summarise_all(funs(sd,median)) %>%
ungroup %>%
mutate(Year = ifelse(duplicated(Year),"",Year))
Is there a way I could display the values related to the median columns as percentages?
I did not know how to use mutate() and scales::percent() for only a subset of columns (I dont want to do it individually, since there will be more columns in the original dataset, making this procedure not practical enough.
What should I have done instead if I wanted to mutate according to a subset of rows?
Thank you
EDIT:
And if it was like this?
summary.table = df %>%
group_by(Year, Group) %>%
summarise_all(funs(median,sd)) %>%
gather(key, value, -Year, -Group) %>%
separate(key, into=c("var", "stat")) %>%
unite(stat_Group, stat, Group) %>%
spread(stat_Group, value) %>%
ungroup %>%
mutate(Year = ifelse(duplicated(Year),"",Year))

We need to use the percent wrapped on median
summary.table <- df %>%
group_by(Year, Group) %>%
group_by(N = n(), add = TRUE) %>%
summarise_all(funs(sd=sd(.),median=scales::percent(median(.)))) %>%
ungroup %>%
mutate(Year = ifelse(duplicated(Year),"",Year))

Combine list of data frames with one column of characters

I am learning to get, cleaning and combining data. I am confused why in a loop rbind command result in returning 10 data instead of expected 30 data as when I combine it manually (i by i).
library(XML)
mergeal <- NULL
tabnums <- 3
for (i in 1:length(tabnums)) {
bnn <- paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
tabnums[i], "&&coming=22-Oct-2015&coming=22-Oct-2015")
tem <- readHTMLTable(bnn, header=T, stringsAsFactors=F)
#data cleaning
ff <- tem[8] #wanted data
ff1 <- as.data.frame(ff)
ff2 <- ff1[ , 1] #get 1st col data only
ff3 <- unique(ff2)
ff4 <- ff3[c(2,5:13)] #wanted list only
#merging dataset
mergeal <- rbind(mergeal, ff4)
}
I've tried using list rbind list of data frames with one column of characters and numerics but still have the same result as above. Appreciate any help on what I missed, thanks.

I cleaned up the data cause I was bored.
library(plyr)
library(XML)
library(dplyr)
library(magrittr)
library(stringi)
library(tidyr)
library(lubridate)
answer =
data_frame(tabnums = 1:3) %>%
group_by(tabnums) %>%
do(.$tabnums %>%
paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
., "&&coming=22-Oct-2015&coming=22-Oct-2015") %>%
readHTMLTable(header = T, stringsAsFactors = F) %>%
extract2(8)) %>%
ungroup %>%
select(V1) %>%
distinct %>%
mutate(V1 =
V1 %>%
stri_replace_all_fixed("Â", "\n") %>%
stri_replace_all_fixed("Type:", "\nType:") %>%
stri_replace_all_fixed("Time:", "\nTime:") %>%
stri_replace_all_fixed("Area:", "\nArea:") %>%
stri_split_fixed("\n")) %>%
unnest(V1) %>%
mutate(V1 = V1 %>% stri_trim) %>%
filter(V1 %>% stri_detect_regex("^There are currently") %>% `!`) %>%
filter(V1 != "") %>%
separate(V1, c("variable", "value"), sep = ":", fill = "left") %>%
mutate(variable = variable %>% mapvalues(NA, "Description"),
ID = variable %>% `==`("Description") %>% cumsum) %>%
spread(variable, value) %>%
mutate(Area = Area %>% extract_numeric,
Price = Price %>% extract_numeric,
Datetime =
Time %>%
stri_replace_all_fixed("a.m.", "am") %>%
stri_replace_all_fixed("p.m.", "pm") %>%
paste(Date, .) %>%
dmy_hm) %>%
select(-Date, -Time)

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

comparing the variables and their values between two data frames - r

Related

Merging datasets in R results in incomplete string values

Loop to create crosstabs of columns using tidyr

Wide Format Summary in tidyverse

Mutating values of subset of columns into percentage format

Combine list of data frames with one column of characters

Categories

Resources