not getting all variables after merging three data sets - r

I am merging three data sets ( data frames) in R as follows:
Prv_mnth_so3 has state, Product_Number , Quantity_On_Hand , Category and Lc_Amount
Prv_mnth_soqty <- Prv_mnth_so3 %>%
filter( Category== "ESC") %>%
group_by (state,Product_Number) %>%
summarise(qty = sum(Quantity_On_Hand))
#arrange(state,Product_Number)
Prv_mnth_so_esc_amt <- Prv_mnth_so3 %>%
filter( Category== "ESC") %>%
group_by (state,Product_Number) %>%
summarise(esc = sum(as.numeric(Lc_Amount))) %>%
arrange(state,Product_Number)
Prv_mnth_so_lom_amt <- Prv_mnth_so3 %>%
filter( Category== "LOM") %>%
group_by (state,Product_Number) %>%
summarise(lom = sum(Lc_Amount))%>%
arrange(state,Product_Number)
Prv_mnth_si <- merge(Prv_mnth_soqty, Prv_mnth_so_esc_amt , Prv_mnth_so_lom_amt,
by.x = c("state","Product_Number") , by.y = c("state","Product_Number"), by.z = c("state","Product_Number"), all = TRUE) ```
in out come (Prv_mnth_si ) I expect 5 variables as - State, Product_number), qty, esc and lom but I am not gettig lom in outcome, though in Prv_mnth_so_lom_amt, I can see lom variables is there

Since you are using the tidyverse you can use one of the join functions provided by the dplyr package.
Change the last line from:
Prv_mnth_si <- merge(Prv_mnth_soqty,
Prv_mnth_so_esc_amt ,
Prv_mnth_so_lom_amt,
by.x = c("state","Product_Number") ,
by.y = c("state","Product_Number"),
by.z = c("state","Product_Number"),
all = TRUE)
to:
Prv_mnth_si <- full_join(Prv_mnth_soqty, Prv_mnth_so_esc_amt) %>%
full_join(Prvmnth_so_lom_amt)

Related

How to loop through a list of values and add it in the designated areas of the code within spatial parameters using sf?

This question builds off of my previous question that serves on its own.
I have a list c("String1","String2","String3")
Without the loop the code looks like this
Data_String1<- DF2 %>%
mutate(`New String`= if_else(lengths(st_intersects(DF2, DataFrame_String1, join = st_within))>0,DataFrame_String1$ID,"N"))%>%
filter(.,`New String`!="N")
Data_String2<- DF2 %>%
mutate(`New String`= if_else(lengths(st_intersects(DF2, DataFrame_String2, join = st_within))>0,DataFrame_String2$ID,"N"))%>%
filter(.,`New String`!="N")
Data_String3<- DF2 %>%
mutate(`New String`= if_else(lengths(st_intersects(DF2, DataFrame_String3, join = st_within))>0,DataFrame_String3$ID,"N"))%>%
filter(.,`New String`!="N")
Using the principles of the solution shown in the previous code I tried to implement here:
lst1 <- map(c("String1", "String2", "String3") ~DF2 %>%
mutate(`New String`= if_else(lengths(st_intersects(DF2, c("String1","String2","String3"), join = st_within)) > 0, .x, "N")) %>%
filter(.,`New String`!="N")
However I receive an error message like this
x no applicable method for 'st_geometry' applied to an object of class "character"` which makes me wonder if the dataframe loses being an spatial entity.
Simply extend from previous solution:
df_lst <- map(c("String1", "String2", "String3"), ~ DataFrame %>%
filter(ID == .x)
)
DF2_subs <- map(df_lst, ~ DF2 %>%
mutate(`New String`= if_else(
lengths(st_intersects(DF2, .x, join = st_within)) > 0,
.x$ID,
"N")) %>%
filter(`New String`!="N")
)
Otherwise pass in actual data frames and not vector of strings
DF2_subs <- map(list(Data_String1, Data_String2, Data_String3), ~ DF2 %>%
mutate(`New String`= if_else(
lengths(st_intersects(DF2, .x, join = st_within)) > 0,
.x$ID,
"N")) %>%
filter(`New String`!="N")
)

Is there a way to combine across() and mutate() if I am referencing column names from a list?

The dataset below has columns with very similar names and some values which are NA.
library(tidyverse)
dat <- data.frame(
v1_min = c(1,2,4,1,NA,4,2,2),
v1_max = c(1,NA,5,4,5,4,6,NA),
other_v1_min = c(1,1,NA,3,4,4,3,2),
other_v1_max = c(1,5,5,6,6,4,3,NA),
y1_min = c(3,NA,2,1,2,NA,1,2),
y1_max = c(6,2,5,6,2,5,3,3),
other_y1_min = c(2,3,NA,1,1,1,NA,2),
other_y1_max = c(5,6,4,2,NA,2,NA,NA)
)
head(dat)
In this example, x1 and y1 would be what I would consider the common "categories" among the columns. In order to get something similar with my current dataset, I had to use grepl to tease these out
cats<-dat %>%
names() %>%
gsub("^(.*)_(min|max)", "\\1",.) %>%
gsub("^(.*)_(.*)", "\\2",.) %>%
unique()
Now, my goal is to mutate a new min and a new max column for each of those categories. So far the code below works just fine.
dat %>%
rowwise() %>%
mutate(min_v1 = min(c_across(contains(cats[1])), na.rm=T)) %>%
mutate(max_v1 = max(c_across(contains(cats[1])), na.rm=T)) %>%
mutate(min_y1 = min(c_across(contains(cats[2])), na.rm=T)) %>%
mutate(max_y1 = max(c_across(contains(cats[2])), na.rm=T))
However, the number of categories in my current dataset is quite a bit bigger than 2.. Is there a way to implement this but quicker?
I've tried a few of the suggestions on this post but haven't quite been able to extend them to this problem.
You can use one of the map function here for each common categories.
library(dplyr)
library(purrr)
result <- bind_cols(dat, map_dfc(cats,
~dat %>%
rowwise() %>%
transmute(!!paste('min', .x, sep = '_') := min(c_across(matches(.x)), na.rm = TRUE),
!!paste('max', .x, sep = '_') := max(c_across(matches(.x)), na.rm = TRUE))))
result

Can I use purrr to execute a dplyr query and save the result of each query output

I have the following dataset:
combined <- data.frame(
client = c('aaa','aaa','aaa','bbb','bbb','ccc','ccc','ddd','ddd','ddd'),
type = c('norm','reg','opt','norm','norm','reg','opt','opt','opt','reg'),
age = c('>50','>50','75+','<25','<25','>50','75+','25-50','25-50','75+'),
cases = c('1','2','2','1','0','1','2','0','3','2'),
IsActive = c('1','0','0','1','1','0','1','1','1','0')
)
And have identified the unique variable combinations with :
# get unique variable combinations
unique_vars <- combined %>%
select(1:3,5) %>%
distinct()
I am trying to iterate on this query combined %>% anti_join(slice(unique_vars,1)) using purrr and save both the output of the query and also save summary of cases from each output back to the unique_vars table. The slice should iterate through each row of unique_vars, not be fixed at 1
I tried :
qry <- combined %>% anti_join(slice(unique_vars,1))
map(.x = unique_vars %>%
slice(.),
~qry %>%
summarise(CaseCnt = sum(cases)) %>%
inner_join(.x))
My desired output would be two things:
Full output of the query
the new Field CaseCnt added to the unique_vars dataframe
Is this possible?
Although I don't completely follow the intuition behind your query, it seems that for #1 you would want:
lapply(1:nrow(unique_vars), function(x) {
combined %>%
anti_join(slice(unique_vars, x), keep = TRUE)
})
And for #2 you would want:
unique_vars$CaseCnt <- lapply(1:nrow(unique_vars), function(x) {
combined %>%
anti_join(slice(unique_vars, x), keep = TRUE) %>%
summarise(CaseCnt = sum(cases %>% as.numeric))
}) %>% do.call(what = rbind.data.frame,
args = .)
Alternatively for #2 with purrr:map_df():
unique_vars$CaseCnt <- map_df(c(1:nrow(unique_vars)), function(x) {
combined %>%
anti_join(slice(unique_vars, x), keep = TRUE) %>%
summarise(CaseCnt = sum(cases %>% as.numeric))
})
Just as an aside -- you could do this directly with:
combined %>%
mutate(cases = as.numeric(cases)) %>%
mutate(tot_cases = sum(cases)) %>% # sum total cases across unique_id's
group_by(client, type, age, IsActive) %>%
summarize(CaseCnt = mean(tot_cases) - sum(cases))
Or if what you were actually looking for is the sum of cases in that group:
combined %>%
mutate(cases = as.numeric(cases)) %>%
group_by(client, type, age, IsActive) %>%
summarize(CaseCnt = sum(cases))

Unnest and concatenate values in r

I am trying to unnest two columns that do not always have the same number of values per cell and then concatenate the values that have a correspond between the two columns. For example:
library('dplyr')
library('tidyr')
#Sample Data
df <- data.frame(id = c(1:4),
first.names = c('Michael, Jim', 'Michael, Michael', 'Creed', 'Creed, Jim'),
last.names = c('Scott, Halpert', 'Scott, Cera', '', 'Halpert'))
Not all values in df$first.names are associated with a value in df$last.names. I am trying to get the following results:
#Desired output
df.results <- data.frame(id = c(1,1,2,2,3,4,4),
first.names = c('Michael', 'Jim', 'Michael', 'Michael', 'Creed', 'Creed', 'Jim'),
last.names = c('Scott', 'Halpert', 'Scott', 'Cera', '', '', 'Halpert'),
full.names = c('Michael Scott', 'Jim Halpert', 'Michael Scott', 'Michael Cera', 'Creed', 'Creed', 'Jim Halpert'))
I have tried using unnest, it works for first.names, but not for last.names (it drops the row where last.names is blank):
#convert to characters
df$first.names <- as.character(df$first.names)
df$last.names <- as.character(df$last.names)
#Unnest first names
df <- df %>%
transform(first.names = strsplit(first.names, ',')) %>%
unnest(first.names)%>%
transform(last.names = strsplit(last.names, ',')) %>%
unnest(last.names)
I was then going to delete duplicate lines, but that still does not solve the the issues with the values in df$first.names that do not have a value in df$last.names
Is there a better way to do this?
Check this solution:
library(tidyverse)
df %>%
as_tibble() %>%
mutate_at(2:3, ~ strsplit(as.character(.x), ',') %>% map(~ str_trim(.x))) %>%
mutate(
First = map2_chr(first.names, last.names, ~ paste(.x[1], .y[1])),
Second = map2_chr(first.names, last.names, ~ paste(.x[2], .y[2]))
) %>%
mutate_at(4:5, ~ str_remove_all(.x, 'NA') %>% str_trim()) %>%
gather('x', 'full.names', First:Second) %>%
filter(full.names != '') %>%
mutate(
first.names = map_chr(full.names, ~ str_split(.x, ' ')[[1]][1]),
last.names = map_chr(full.names, ~ str_split(.x, ' ')[[1]][2]) %>%
replace_na('')
) %>%
select(-x) %>%
arrange(id)
I can include a logic, that if there is one last.names it will combine it with the second first.names to get the same result, but I don't think this is what you want. Vector with first.names that has no second.names can solve the problem.

Join across/within nested dataframes

I am pulling information out of a model for eventual plotting. My desired plots are jittered original data with an overlay of mean +/- STDERR and text groupings. The model outputs put the groupings and estimates in separate dataframes within a list. I'm using map to extract those and it works, but I'm stuck with the step of joining them together.
I want to join two nested list-cols into a single table and nest that result as a new column. Best I can do currently is to unnest, join tables, nest again, and join back to original nested table.
library(agricolae)
library(tidyverse)
fitHSD2<- function(d) HSD.test(aov(mpg ~ cyl, data= d), trt = "cyl") # anova with Tukey HSD
carnestdf <-
mtcars %>%
group_by(gear) %>%
nest() %>%
mutate(mod = map(data, fitHSD2) # fit model
, estimates = map(mod, function(df) return(df$means)) # pull out estimates and StdErr
, estimates = map(estimates, function(df) return(rownames_to_column(df, var = "trt"))) #attach rownames as column for unnest
, grouping = map(mod, function(df) return(df$groups)) # pull out groupings
, grouping = map(grouping, function(df) mutate(df, trt = as.character(trt) # convert to character
, trt = gsub("[[:space:]]*$", "", trt)
, M = as.character(M)
)
) # remove whitespace at end for join
)
carnestdf
I can unnest each one and join them, but I can't nest and join them back. I can in fact... just need to define the join key otherwise it tries to join based upon the nested DF and that doesn't work without the hashing below.
full_join(unnest(carnestdf , estimates), unnest(carnestdf , grouping)) %>%
group_by(gear) %>%
nest(.key = "estgrp") %>%
full_join(carnestdf, ., by = "gear")
I found this: R: Join two tables (tibbles) by *list* columns
But it doesn't seem to work, I get the same error when using the hash to join. It does work, needed to define the .key in nest so it wasn't "data". Would still prefer to join without unnesting... :/
nestmerge <-
full_join(unnest(carnestdf , estimates), unnest(carnestdf , grouping)) %>%
group_by(gear) %>%
nest(.key = "mergedestgrp") %>%
mutate_all(funs(hash = map_chr(., digest::digest)))
carnestdf %>%
mutate_all(funs(hash = map_chr(., digest::digest))) %>%
full_join(., nestmerge) %>%
select(-ends_with("hash"))
The answer apparently is map2:
carnestdf <-
mtcars %>%
group_by(gear) %>%
nest() %>%
mutate(mod = map(data, fitHSD2) # fit model
, estimates = map(mod, function(df) return(df$means)) # pull out estimates and StdErr
, estimates = map(estimates, function(df) return(rownames_to_column(df, var = "trt"))) #attach rownames as column for unnest
, grouping = map(mod, function(df) return(df$groups)) # pull out groupings
, grouping = map(grouping, function(df) mutate(df, trt = as.character(trt) # convert to character
, trt = gsub("[[:space:]]*$", "", trt)
, M = as.character(M)
)
) # remove whitespace at end for join
, estgrp = map2(estimates, grouping, ~full_join(.x, .y, by = "trt"))
)
carnestdf
This does a full join on the two tables by "trt" and makes a new list column with the result.

Resources