I want to replace specifically the numerical columns of one dataset, with the numerical columns of a corresponding transformed dataset. How can I do it (without using code specific to the particular dataset)?
e.g. toy example from mpg in library ggplot2:
mpg0 <- mpg
set.seed(123)
mpg0[sample(nrow(mpg),70,replace=FALSE),3] <- NA
mpg0[sample(nrow(mpg),70,replace=FALSE),8] <- NA
mpg0[sample(nrow(mpg),70,replace=FALSE),9] <- NA
sampled <- sample(nrow(mpg),50,replace=FALSE)
mpg_test <- mpg0[sampled,]
mpg_train <- mpg0[-sampled,]
mpg_mean <- mpg_train %>% group_by(cyl) %>% summarise_if(is.numeric,mean,na.rm=TRUE)
temp1 <- mpg_test %>% left_join(mpg_mean, by = 'cyl')
Now I would like to replace the missing values in the numeric columns of mpg_test (columns displ, cty, hwy--there are no NAs in the other numeric columns) with the values in the corresponding column from the left-join. I can do it with
temp1 <- as.data.frame(temp1)
temp1[c(3,8,9)][is.na(temp1[c(3,8,9)])] <- temp1[c(12,14,15)][is.na(temp[c(3,8,9)])]
But that is specific to this dataset. Problem with mutate_if is that I don't know what function to put in. Is there a good general way of doing this, i.e. mutating the numeric columns to get the means, replacing NA with values in the same row from the corresponding left-joined columns?
(dplyr only please)
You can do this by altering your left join and using case_when:
library(dplyr)
temp1 <- left_join(mpg_test, mpg_mean, by = "cyl")
temp1 %>%
mutate_if(is.integer, as.numeric) %>%
mutate(displ.x =
case_when(
is.na(displ.x) ~ displ.y,
TRUE ~ displ.x
),
cty.x =
case_when(
is.na(cty.x) ~ cty.y,
TRUE ~ cty.x
),
hwy.x =
case_when(
is.na(hwy.x) ~ hwy.y,
TRUE ~ hwy.x
)) %>%
select(-c(displ.y, year.y, cty.y, hwy.y)) %>%
rename(displ = displ.x,
year = year.x,
cty = cty.x,
hwy = hwy.x)
You can use coalesce :
library(dplyr)
mpg_test %>%
left_join(mpg_mean, by = 'cyl') %>%
mutate(displ = coalesce(displ.x, displ.y),
cty = coalesce(displ.x, displ.y),
hwy = coalesce(hwy.x, hwy.y)) %>%
select(-matches('\\.x|\\.y'))
Related
I have a dataframe like the following:
observations<- data.frame(X=c("00KS089001","00KS089001","00KS089002","00KS089002","00KS089003","00KS089003","00KS105001","00KS105001", "00KS177011","00KS177011","00P0006","00P006","00P006","00P006"), hzdept = c(0,20,0,15,0,13,0,20,0,16,0,6,13,29), hzdepb = c(20,30,15,30,13,30,20,30,16,30,6,13,29,30),Y=c("Red","White","Red","White","Green","Red","Red","Blue", "Black","Black","Red","White","White","White"), Z = c(0.67,0.33,0.5,0.5,0.43,0.57,0.67,0.33,0.53,0.47,0.2,0.23,0.53,0.04))
I want to be able to reduce this so that anytime X and Y are the same for two rows, the observations are combined i.e.
data.frame(X=c("00KS089001","00KS089001","00KS089002","00KS089002","00KS089003","00KS089003","00KS105001","00KS105001", "00KS177011","00P0006","00P006"), hzdept = c(0,20,0,15,0,13,0,20,0,0,6), hzdepb = c(20,30,15,30,13,30,20,30,30,6,30),Y=c("Red","White","Red","White","Green","Red","Red","Blue", "Black","Red","White"), Z = c(0.67,0.33,0.5,0.5,0.43,0.57,0.67,0.33,1.00,0.20,0.80))
Any suggestions on how to best go about this?
Edit: ok, now that I see how hzdept and hzdepb are supposed to be combined from your commment above:
library(tidyverse)
df <- observations %>% count(X,Y,wt = Z,name = "Z")
df_hzdept <- observations %>%
arrange(hzdept) %>%
distinct(X,Y,.keep_all = T) %>%
select(X,Y,hzdept)
df_hzdepb <- observations %>%
arrange(desc(hzdepb)) %>%
distinct(X,Y,.keep_all = T) %>%
select(X,Y,hzdepb)
df <- df %>% left_join(df_hzdept) %>% left_join(df_hzdepb)
Using dplyr
Here is how you would group by two columns and summarize using the minimum, max, and sum other columns in a dataframe:
library(magrittr) # For the pipe: %>%
observations %>%
dplyr::group_by(X, Y) %>%
dplyr::summarise(hzdept = min(hzdept),
hzdepb = max(hzdepb),
Z = sum(Z), .groups = 'drop')
I need to fit many loess splines by the grouping variable (Animal) across multiple numeric columns (Var1, Var2), and extract these values.
I found code to do this task one variable at a time;
# Create dataframe 1
OneVarDF <- data.frame(Day = c(replicate(1,sample(1:50,200,rep=TRUE))),
Animal = c(c(replicate(100,"Greyhound"), c(replicate(100,"Horse")))),
Var1 = c(c(replicate(1,sample(2:10,100,rep=TRUE))), c(replicate(1,sample(15:20,100,rep=TRUE)))))
library(dplyr)
library(tidyr)
library(purrr)
# Get fitted values from each model
Models <- OneVarDF %>%
tidyr::nest(-Animal) %>%
dplyr::mutate(m = purrr::map(data, loess, formula = Var1 ~ Day, span = 0.30),
fitted = purrr::map(m, `[[`, "fitted")
)
# Create prediction column
Results <- Models %>%
dplyr::select(-m) %>%
tidyr::unnest()
This "Results" dataframe is essential for downstream tasks (detrending many non-parametric distributions).
How can we achieve this with a dataframe with multiple numeric columns (code below), and extract a "Results" dataframe? Thank you.
# Create dataframe 2
TwoVarDF <- data.frame(Day = c(replicate(1,sample(1:50,200,rep=TRUE))),
Animal = c(c(replicate(100,"Greyhound"), c(replicate(100,"Horse")))),
Var1 = c(c(replicate(1,sample(2:10,100,rep=TRUE))), c(replicate(1,sample(15:20,100,rep=TRUE)))),
Var2 = c(c(replicate(1,sample(22:27,100,rep=TRUE))), c(replicate(1,sample(29:35,100,rep=TRUE)))))
We can get the data in long format using. pivot_longer, group_by Animal and column name and apply loess to each combinaton.
library(dplyr)
library(tidyr)
TwoVarDF %>%
pivot_longer(cols = starts_with('Var')) %>%
group_by(Animal, name) %>%
mutate(model = loess(value~Day, span = 0.3)$fitted)
Include a gather() function to proceed as similar to your previous code.
Models2 <- TwoVarDF %>%
gather(varName, varVal, 3:4) %>%
tidyr::nest(-Animal, -varName) %>%
dplyr::mutate(m = purrr::map(data, loess, formula = varVal ~ Day, span = 0.30),
fitted = purrr::map(m, `[[`, "fitted")
)
I would like to efficiently impute missing values with a slightly different value in each cell.
for example:
df <- data_frame(x = rnorm(100), y = rnorm(100))
df[1:5,1] <- NA
df[1:5, 2] <- NA
df %<>% mutate_all(funs(ifelse(is.na(.), jitter(median(., na.rm = TRUE)), .)))
However, this imputes with the same number in all cells.
How can I add a different noise to each cell?
Of course, I could do this with a loop, but my data frame is huge and I would like to do this efficiently
We can use rep with n()
library(dplyr)
library(magrittr)
df %<>%
mutate_all(list(~ case_when(is.na(.) ~ jitter(rep(median(., na.rm = TRUE), n())),
TRUE ~ .)))
I'm trying to replace the NAs in multiple column variables with randomly generated values from each student_id's subset row data:
data snapshot
so for student 3, systolic needs two NAs replaced. I used the min and max values for each variable within the student 3 subset to generate random values.
library(dplyr)
library(tidyr)
library(tibble)
library(tidyverse)
dplyr::filter(exercise, student_id == "3") %>% replace_na(list(systolic= round(sample(runif(1000, 125,130),2),0),
diastolic =round(sample(runif(1000, 85,85),3),0), heart_rate= round(sample(runif(1000, 79,86),2),0),
phys_score = round(sample(runif(1000, 8,9),2),0)
However it works only when one NA needs replacing: successfully replaced systolic NA values. When I try to replace more than one NAs, this error comes up.
Error: Replacement for `systolic` is length 2, not length 1
Is there a way to fix this? I tried converting the column variables to data frames instead of the vectors they are now, but it only returned the original data without any replacement changes.
Are there any simpler ways to this? Any suggestions/comments would be appreciated. Thanks.
A solution that makes things a little more automated but may be unnecessarily complex.
Generated some grouped missing data from the mtcars dataset
library(magrittr)
library(purrr)
library(dplyr)
library(stringr)
library(tidyr)
## Generate some missing data with a subset of car make
mtcars_miss <- mtcars %>%
as_tibble(rownames = "car") %>%
select(car) %>%
separate(car, c("make", "name"), " ") %>%
bind_cols(mtcars[, -1] %>%
map_df(~.[sample(c(TRUE, NA), prob = c(0.8, 0.2),
size = length(.), replace = TRUE)])) %>%
filter(make %in% c("Mazda", "Hornet", "Merc"))
Function to replace na values from a given variable by sampling within the min and max and depending on some group (here make).
replace_na_sample <- function(df_miss, var, group = "make") {
var <- enquo(var)
df_miss %>%
group_by(.dots = group) %>%
mutate(replace_var := round(runif(n(), min(!!var, na.rm = T),
max(!!var, na.rm = T)), 0)) %>%
rowwise %>%
mutate_at(.vars = vars(!!var),
.funs = funs(replace_na(., replace_var))) %>%
select(-replace_var) %>%
ungroup
}
Example replacing several missing values in multiple columns.
mtcars_replaced <- mtcars_miss %>%
replace_na_sample(cyl, group = "make") %>%
replace_na_sample(disp, group = "make") %>%
replace_na_sample(hp, group = "make")
I want to calculate rowsums only if colnames (i.e. species) of my data frame match two arguments in a second attribute table. This means it shoul first match the name in a column of the attributes table AND have a certain entry in another column of the attribute table.
However, the attribute table contains more species than the orginal data frame.
I tried :
# Species data from vegan package:
data(varespec, package = "vegan")
# create attributes table
attributes <- matrix(NA, length(varespec), 2)
attributes[,1] <- colnames(varespec)
attributes[,2] <- c(rep("MI",14),rep("PI",30))
# add species to the attribute table
x <- c("spec1","MI")
y <- c("spec2","PI")
attributes <- rbind(attributes, x, y)
row.names(attributes) <- c(1:46)
# calculate rowsums only for species contained in the attributes table
# and having the entry "MI" in the attributes table
for (i in 1:44){
for (j in 1:46){
if ((colnames(varespec)[i] == attributes[j,1]) & (attributes[j,2] == "MI")) {
apply(varespec,1,sum)
}
}}
But it always summed up the whole rows and not only the MI - species.
This is easy to solve if you convert the dataset into a long format
library(dplyr)
library(tidyr)
data(varespec, package = "vegan")
attributes <- data.frame(
Species = c(colnames(varespec), "spec1", "spec2"),
Attribute = c(rep(c("MI", "PI"), c(14, 30)), "MI", "PI")
)
varespec %>%
add_rownames("ID") %>%
gather(Species, Value, -ID) %>% #convert to long format
inner_join(attributes, by = "Species") %>%
filter(Attribute == "MI") %>%
group_by(ID) %>%
summarise(Total = sum(Value))