Merging datasets in R results in incomplete string values - r

I wanted to merge two datasets and the merging resulted in getting incomplete strings in a variable
# Librerias
```{r}
library(dplyr)
library(tidyr)
library(tidyverse)
library(readxl)
library(readr)
library(base)
library(stringr)
library(foreign)
library(forcats)
library(fs)
library(hablar)
library(openxlsx)
```
# Comercio CIIU
```{r}
# Mercancias
Catalogo_comercio_CIIU <- read_excel("Comercio/Catalogo comercio bienes CIIU.xlsx") %>%
mutate(ACTIV4=CIIU)
CIIU_Exportaciones <- read_excel("Comercio/CIIU Exportaciones.xlsx") %>%
pivot_longer(cols = -...1, names_to = "Period", values_to = "Valor") %>%
mutate(Flujo="Exportaciones")
CIIU_Importaciones <- read_excel("Comercio/CIIU Importaciones.xlsx") %>%
pivot_longer(cols = -...1, names_to = "Period", values_to = "Valor") %>%
mutate(Flujo="Importaciones")
Length<- CIIU_Exportaciones %>% group_by(...1) %>%
summarise(Obs=n())
Length<- Length$Obs[[1]] %>%
as.numeric() %>%
as.vector()
# Mensual
Comercio_bienes_mensual <- rbind(CIIU_Importaciones, CIIU_Exportaciones) %>%
rename(Actividad="...1") %>% group_by(Flujo, Actividad) %>%
mutate(Fecha=seq(from=as.Date("1994-01-01"), by="month", length.out=Length)) %>%
mutate(Year=str_sub(Fecha, 1L,4L), Mes=str_sub(Fecha, 6L,7L)) %>%
group_by(Flujo, Actividad, Year) %>%
mutate(Acumulado=cumsum(Valor)) %>%
group_by(Flujo, Actividad) %>%
mutate( C_acumulado= Acumulado-lag(Acumulado, n=12L), TC_acumulado=Acumulado/lag(Acumulado, n=12L)-1 ) %>%
merge(Catalogo_comercio_CIIU, by="Actividad") %>%
select(-Grupo,-Detalle,-Categoria1,-Categoria2)
```
This is the catalogue I want to merge
This is how the merging turned out (incomplete string)
I am a beginner so I don't really know what to try

Related

comparing the variables and their values between two data frames

I have two data frames with same kind of data, now i want to check for all the columns in both data frames have same kind of text in all columns in both data frames .
so for example the column name "sales executives" in both data frames have exact name "Micheal klay" in both data frames but if there is any spelling error or extra space i want to show it as not matching.
I have tried below approach and its working for small database but because my data is very big, data having approx 10 - 40 millions or records so its showing error
do we have any solution or any other approach to do that
cannot allocate vector of size 3.2GB
library(tidyverse)
df1 <- data.frame(MAN=c(6,6,4,6,8,6,8,4,4,6,6,8,8),MANi=c("OD","NY","CA","CA","OD","CA","OD","NY","OL","NY","OD","CA","OD"),
nune=c("akas","mani","juna","mau","nuh","kil","kman","nuha","huna","kman","nuha","huna","mani"),
klay=c(1,2,2,1,1,2,1,2,1,2,1,1,2),emial=c("dd","xyz","abc","dd","xyz","abc","dd","xyz","abc","dd","xyz","abc","dd"),Pass=c("Low","High","Low","Low","High","Low","High","High","Low","High","High","High","Low"),fri=c("KKK","USA","IND","SRI","PAK","CHI","JYP","TGA","KKK","USA","IND","SRI","PAK"),
mkl=c("m","f","m","m","f","m","m","f","m","m","f","m","m"),kin=c("Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Rec","Sent","Rec"),munc=c("Car","Bus","Truk","Cyl","Bus","Car","Bus","Bus","Bus","Car","Car","Cyl","Car"),
lone=c("Sr","jun","sr","jun","man","man","jr","Sr","jun","sr","jun","man","man"),wond=c("tko","kent","bho","kilt","kent","bho","kent","bho","bho","kilt","kent","bho","kilt"))
df2 <- data.frame(MAN=c(6,6,4,6,8,6,8,4,4,6,6,8,8,8,6),MANi=c("OD","NY","CA","CA","OD","CA","OD","NY","OL","ny","OD","CA","OD","NY","OL"),
nune=c("akas","mani","juna","mau","nuh","kil","kman","nuha","huna","kman","nuha","huna","mani","juna","mau"),
klay=c(1,2,2,1,1,2,1,2,1,2,1,1,2,2,1),emial=c("dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC","dd","xyz","ABC"),Pass=c("Low","High","Low","Low","High","Low","High","High","Low","High","High","High","Low","High","High"),fri=c("KKK","USA","IND","SRI","PAK","CHI","JYP","TGA","KKK","USA","IND","SRI","PAK","CHI","JYP"),
mkl=c("male","female","male","male","female","male","male","female","male","male","female","male","male","female","male"),kin=c("Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Sent","Rec","Rec","Sent","Rec","Sent","Rec"),munc=c("Car","Bus","Truk","Cyl","Bus","Car","Bus","Bus","Bus","Car","Car","Cyl","Car","Bus","Bus"),
lone=c("Sr","jun","sr","jun","man","man","jr","Sr","jun","sr","jun","man","man","jr","man"),wond=c("tko","kent","bho","kilt","kent","bho","kent","bho","bho","kilt","kent","bho","kilt","kent","bho"))
df1_long <- df1 %>%
as_tibble() %>%
mutate_if(is.double, as.character) %>% distinct() %>%
pivot_longer(everything(), names_to = "Names", values_to = "options") %>%
arrange(Names, options)
df2_long <- df2 %>%
as_tibble() %>%
mutate_if(is.double, as.character) %>% distinct() %>%
pivot_longer(everything(), names_to = "Names", values_to = "options") %>%
arrange(Names, options)
T1 <- df1_long %>%
full_join(df2_long, by=c("Names", "options"), keep = TRUE) %>%
distinct(Names.x, options.x, Names.y, options.y) %>%
arrange(Names.x, Names.y, options.x, options.y) %>%
mutate(
consistant_names = !is.na(Names.x) & !is.na(Names.y),
consistant_options = !is.na(options.x) & !is.na(options.y)
)
the output required like below
below are inconsistency between data bases

Loop to create crosstabs of columns using tidyr

I would like to use a loop to create crosstabs of one column with every other column in a df. I started with this code (substituting in the iris df), which works nicely for two variables:
iris <- iris
tbl <- iris %>%
tabyl(Species, Sepal.Length, show_missing_levels = FALSE, show_na = FALSE) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>%
adorn_title("combined") %>%
knitr::kable()
print(tbl)
My df contains ~200 columns. I thought I would write a for loop to print a crosstab for one variable with each of the other variables. Here's what I tried:
cols <- c('Sepal.Length', 'Sepal.Width')
for (c in cols){
tbl <- iris %>%
tabyl(Species, c, show_missing_levels = FALSE, show_na = FALSE) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>%
adorn_title("combined") %>%
knitr::kable()
print(tbl)
}
This returns Column `c` is not found.
This seems like it should be simple, but I can't figure it out. Thanks for any help.
Change the c in your code to !!sym(c). I can't explain this non-standard tidyverse evaluation thingy, but in layman's terms, you want to access an object (i.e. "c") outside of your pipe (iris). That's why you need !!sym.
You can use the .data pronoun when passing columns names as strings.
cols <- c('Sepal.Length', 'Sepal.Width')
for (col in cols){
tbl <- iris %>%
tabyl(Species, .data[[col]],show_missing_levels = FALSE,show_na = FALSE) %>%
adorn_percentages("row") %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>%
adorn_title("combined") %>%
knitr::kable()
print(tbl)
}

add summary `n` from one dataframe to another dataframe (tidyverse)

I was wondering if there might be a way to replace the column fpc in DATA2 with corresponding fpc obtained from DATA1?
library(tidyverse)
dat <- read.csv('https://raw.githubusercontent.com/rnorouzian/d/master/su.csv')
## 10000 rows ################
DATA1 <- dat %>%
group_by(across(all_of(c("gender", "pre")))) %>%
summarise(n = n(), .groups = 'drop') %>%
mutate(fpc = n/sum(n)) %>%
right_join(dat)
dat2 <- read.csv('https://raw.githubusercontent.com/rnorouzian/d/master/out.csv')
## 200 rows #################
DATA2 <- dat2 %>%
group_by(across(all_of(c("gender", "pre")))) %>%
summarise(n = n(), .groups = 'drop') %>%
mutate(fpc = n/sum(n)) %>%
right_join(dat2)
You can join the dataframe and use coalesce to select fpc from DATA2.
library(dplyr)
result <- DATA2 %>%
left_join(DATA1 %>% distinct(gender, pre, fpc),
by = c('gender', 'pre')) %>%
mutate(fpc = coalesce(fpc.y, fpc.x)) %>%
select(names(DATA2))
nrow(result)
#[1] 200
It would be more efficient to do this in data.table
library(data.table)
setDT(DATA2)[as.data.table(unique(DATA1[c('gender', 'pre', 'fpc')])),
fpc := i.fpc, on = .(gender, pre)]

How do I parameterize the column names in dplyr function calls

Suppose we have the following setup
library(dplyr)
set.seed(10101)
id <- sample(3,20,replace = TRUE)
x <- sample(2,20,replace = TRUE)
df <- data.frame(id,x)
How do I parameterize the following:
df %>% group_by(id) %>% arrange(id) %>% mutate(x.lag=lag(x,1,default=0))
cl <- "x"
cl.lag <- "x.lag.1"
my naive attempt does not seem to work:
df %>% group_by(id) %>% arrange(id) %>% mutate(cl.lag=lag(cl,1,default=0))

Combine list of data frames with one column of characters

I am learning to get, cleaning and combining data. I am confused why in a loop rbind command result in returning 10 data instead of expected 30 data as when I combine it manually (i by i).
library(XML)
mergeal <- NULL
tabnums <- 3
for (i in 1:length(tabnums)) {
bnn <- paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
tabnums[i], "&&coming=22-Oct-2015&coming=22-Oct-2015")
tem <- readHTMLTable(bnn, header=T, stringsAsFactors=F)
#data cleaning
ff <- tem[8] #wanted data
ff1 <- as.data.frame(ff)
ff2 <- ff1[ , 1] #get 1st col data only
ff3 <- unique(ff2)
ff4 <- ff3[c(2,5:13)] #wanted list only
#merging dataset
mergeal <- rbind(mergeal, ff4)
}
I've tried using list rbind list of data frames with one column of characters and numerics but still have the same result as above. Appreciate any help on what I missed, thanks.
I cleaned up the data cause I was bored.
library(plyr)
library(XML)
library(dplyr)
library(magrittr)
library(stringi)
library(tidyr)
library(lubridate)
answer =
data_frame(tabnums = 1:3) %>%
group_by(tabnums) %>%
do(.$tabnums %>%
paste0("http://www.ngchanmau.com/listing_browse.php?cur_page=",
., "&&coming=22-Oct-2015&coming=22-Oct-2015") %>%
readHTMLTable(header = T, stringsAsFactors = F) %>%
extract2(8)) %>%
ungroup %>%
select(V1) %>%
distinct %>%
mutate(V1 =
V1 %>%
stri_replace_all_fixed("Â", "\n") %>%
stri_replace_all_fixed("Type:", "\nType:") %>%
stri_replace_all_fixed("Time:", "\nTime:") %>%
stri_replace_all_fixed("Area:", "\nArea:") %>%
stri_split_fixed("\n")) %>%
unnest(V1) %>%
mutate(V1 = V1 %>% stri_trim) %>%
filter(V1 %>% stri_detect_regex("^There are currently") %>% `!`) %>%
filter(V1 != "") %>%
separate(V1, c("variable", "value"), sep = ":", fill = "left") %>%
mutate(variable = variable %>% mapvalues(NA, "Description"),
ID = variable %>% `==`("Description") %>% cumsum) %>%
spread(variable, value) %>%
mutate(Area = Area %>% extract_numeric,
Price = Price %>% extract_numeric,
Datetime =
Time %>%
stri_replace_all_fixed("a.m.", "am") %>%
stri_replace_all_fixed("p.m.", "pm") %>%
paste(Date, .) %>%
dmy_hm) %>%
select(-Date, -Time)

Resources