Recoding values in a selection of columns of a dataframe using dplyr - r

I have a large dataset like the one in the next example. Columns with As in their headers have codes from 1 to 4, and columns with Bs from 1 to 3.
library(dplyr)
# Example data: an ID column plus columns coded 1-4 (A, AA) and 1-3 (B, BB).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
d <- data.frame(
  ID = 1:10,
  A = sample(x = 1:4, size = 10, replace = TRUE),
  AA = sample(x = 1:4, size = 10, replace = TRUE),
  B = sample(x = 1:3, size = 10, replace = TRUE),
  BB = sample(x = 1:3, size = 10, replace = TRUE)
)
Is there a neat way to use pipes in dplyr to recode the values from columns with As in the headers and columns with Bs to the following strings?
As - from 1, 2, 3, 4 to Green, Yellow, Orange, Red respectively
Bs - from 1, 2, 3 to Green, Yellow, Red respectively
This is a simplified and friendlier version of the real dataset.

By using mutate_at from dplyr, it is possible to recode the numeric codes as strings. The columns to be recoded must first be coerced from numeric to character; otherwise, there will be an error message.
library(dplyr)
# Example data: an ID column plus columns coded 1-4 (A, AA) and 1-3 (B, BB).
d <- data.frame(
  ID = 1:10,
  A = sample(x = 1:4, size = 10, replace = TRUE),
  AA = sample(x = 1:4, size = 10, replace = TRUE),
  B = sample(x = 1:3, size = 10, replace = TRUE),
  BB = sample(x = 1:3, size = 10, replace = TRUE)
)

# Recode in three mutate_at() steps:
#   1. coerce every non-ID column to character,
#   2. map codes 1-4 in the A columns to colour names,
#   3. map codes 1-3 in the B columns to colour names.
# Functions are passed directly or as ~ lambdas; funs() is deprecated.
d_recoded <- d %>%
  mutate_at(vars(-contains("ID")), as.character) %>%
  mutate_at(
    vars(contains("A"), -contains("ID")),
    ~ case_when(
      . == 1 ~ "Green",
      . == 2 ~ "Yellow",
      . == 3 ~ "Orange",
      . == 4 ~ "Red"
    )
  ) %>%
  mutate_at(
    vars(contains("B"), -contains("ID")),
    ~ case_when(
      . == 1 ~ "Green",
      . == 2 ~ "Yellow",
      . == 3 ~ "Red"
    )
  )

Related

I want a function in R that checks whether the values of one column are greater than the 75th percentile and then writes "Yes" or "No" in the next column

I have tried the following formula but it gives all nos even when I change the quantile value.
NOTE: I have 3 independent datasets to which I want to apply the function.
# Question's attempt, kept as posted: meant to label values of x1 that lie
# above their 0.75 quantile as "Yes" and all others as "No".
outlier<-function(x1,x2){
# 0.75 quantile of x1, ignoring missing values
q1<-quantile(x1 , .75, na.rm = TRUE)
# NOTE(review): if() expects a single TRUE/FALSE, but x1>q1 has one element
# per value of x1 when x1 is a whole column, so rows are not labelled one by one.
if(x1>q1){x2<-"Yes"
}else{
x2<-"No"
}
# NOTE(review): x2 is only assigned inside the function, so the object the
# caller passed as x2 is left unchanged.
}
I have tried x2<-ifelse(x1>q1,"Yes","No")
inside the function but it still doesn't work.
You can use an ifelse statement and create a new column using mutate.
library(dplyr)
# Reproducible example: ten draws (with replacement) from 1-10.
set.seed(1)
df <- tibble(x1 = sample(1:10, size = 10, replace = TRUE))

# Flag every value that lies strictly above the column's 0.75 quantile.
# TRUE is spelled out because T is an ordinary, reassignable variable.
df %>%
  mutate(x2 = ifelse(quantile(x1, 0.75, na.rm = TRUE) < x1, "Yes", "No"))
If you want a function
library(dplyr)
# Reproducible example data: four columns, each ten draws from 1-10.
# TRUE is spelled out because T is an ordinary, reassignable variable.
set.seed(1)
df <- tibble(
  x1 = sample(1:10, size = 10, replace = TRUE),
  x2 = sample(1:10, size = 10, replace = TRUE),
  x3 = sample(1:10, size = 10, replace = TRUE),
  x4 = sample(1:10, size = 10, replace = TRUE)
)
# Flag, for each selected column, the values that lie above a chosen quantile.
#
# dataframe: data frame or tibble holding the columns to check.
# quant:     probability passed to quantile(); defaults to 0.75.
# col:       character vector naming the columns to check.
#
# Returns `dataframe` with one extra "<column>_yes" column per checked column,
# holding "Yes" where the value exceeds that column's quantile and "No"
# otherwise.
outlier <- function(dataframe, quant = 0.75, col = c("x1", "x2")) {
  dataframe %>%
    mutate(across(
      all_of(col),
      # The cutoff follows the caller's `quant`; missing values are skipped
      # when the cutoff is computed.
      ~ ifelse(.x > quantile(.x, quant, na.rm = TRUE), "Yes", "No"),
      .names = "{.col}_yes"
    ))
}

# Example: flag values above the first quartile of x1 and x2.
outlier(dataframe = df, quant = 0.25)

Selecting elements from a list with non compatible length

Given the following structure of the list:
# Nested example list: three entries, each with a `Main` element. The third
# entry has no `two`/`three` fields and carries two values per list_a field.
x <- list(
  list(Main = list(
    one = list(tlv = 1, beta = 2),
    two = "three",
    three = 4,
    list_a = list(list(value_1 = "a1", value_2 = "b", c = "c"))
  )),
  list(Main = list(
    one = list(tlv = 2, beta = 6),
    two = "seven",
    three = 8,
    list_a = list(list(value_1 = "aa2", value_2 = "bb", c = "cc"))
  )),
  list(Main = list(
    one = list(tlv = 3),
    list_a = list(list(
      value_1 = c("aaa3", "aaaa4"),
      value_2 = c("bbb", "bbbb"),
      c = c("ccc", "ccc")
    ))
  ))
)
I'm trying to create a dataframe with a structure like this:
tlv | value_1
1 | a1
2 | aa2
3 | aaa3
3 | aaaa4
So far I have the following:
library(tidyverse)
# Question's attempt, kept as posted: each column is built by extracting a
# nested element from every entry of x and flattening the result.
tibble::tibble(
# one tlv per entry of x
tlv = map(x, list(1,1,"tlv"), .default = NA) %>% unlist(),
# NOTE(review): unlist() flattens multi-valued entries, so this column can end
# up longer than tlv -- the size mismatch reported in the error that follows.
value = map(x, list(1,"list_a", 1, "value"), .default = NA) %>% unlist())
Which leads to the following error:
Error: Tibble columns must have compatible sizes.
* Size 3: Existing data.
* Size 4: Column `value`.
i Only values of size one are recycled.
This makes sense given the structure of the list (3 values for one of the variables and 4 values for the other). But I don't see a solution to link the values to the parent element of the list, so that every 'value' also gets the corresponding 'tlv' value. Any guidance on how to solve this problem?
Found a solution, this does the trick:
# Build one small tibble per top-level entry of x and row-bind them. Within
# each entry the single tlv is recycled across that entry's value_1 values.
map_df(x, function(entry) {
  tibble(
    tlv = entry$Main$one$tlv,
    value = entry$Main$list_a[[1]]$value_1
  )
})
An alternative :
library(tidyverse)
# Extract "value_1" from the elements four levels down in x; .ragged = TRUE
# is set because some branches of x are shallower than that. Each top-level
# entry's result is then flattened to a plain vector.
value_1 <-
map_depth(x, 4, pluck, "value_1", .ragged = TRUE) %>%
map(unlist, use.names = FALSE)
# Same idea for "tlv", three levels down; map_dbl() expects exactly one
# number per top-level entry.
tlv <-
map_depth(x, 3, pluck, "tlv") %>%
map_dbl(unlist, use.names = FALSE)
# Pair each tlv with its list of value_1 entries, then let unnest_auto()
# expand the list-column.
df <-
tibble(tlv = tlv, value_1 = value_1) %>%
unnest_auto(col = value_1)

chi square over multiple groups and variables

I have a huge dataset with several groups (factors with between 2 to 6 levels), and dichotomous variables (0, 1).
example data
# Example data: three grouping columns (2-4 levels each) and five 0/1
# variables, 100 rows. TRUE is spelled out because T can be reassigned.
DF <- data.frame(
  group1 = sample(x = c("A", "B", "C", "D"), size = 100, replace = TRUE),
  group2 = sample(x = c("red", "blue", "green"), size = 100, replace = TRUE),
  group3 = sample(
    x = c("tiny", "small", "big", "huge"), size = 100, replace = TRUE
  ),
  var1 = sample(x = 0:1, size = 100, replace = TRUE),
  var2 = sample(x = 0:1, size = 100, replace = TRUE),
  var3 = sample(x = 0:1, size = 100, replace = TRUE),
  var4 = sample(x = 0:1, size = 100, replace = TRUE),
  var5 = sample(x = 0:1, size = 100, replace = TRUE)
)
I want to do a chi square for every group, across all the variables.
library(tidyverse)
library(rstatix)
# One chi-square test per group/variable pair, written out by hand
# (chisq_test() comes from rstatix).
chisq_test(DF$group1, DF$var1)
chisq_test(DF$group1, DF$var2)
chisq_test(DF$group1, DF$var3)
...
etc
I managed to make it work by using two nested for loops, but I'm sure there is a better solution
# Question's working approach, kept as posted: loop over every group/variable
# pair, run the test, tag the result with the pair, and append it to `results`.
groups <- c("group1","group2","group3")
vars <- c("var1","var2","var3","var4","var5")
results <- data.frame()
for(i in groups){
for(j in vars){
test <- chisq_test(DF[,i], DF[,j])
# record which group/variable pair produced this result
test <- mutate(test, group=i, var=j)
# NOTE(review): rbind() inside a loop copies `results` on every pass
results <- rbind(results, test)
}
}
results
I think I need some kind of apply function, but I can't figure it out
Here is one way to do it with apply. I am sure there is an even more elegant way to do it with dplyr. (Note that here I extract the p.value of the test, but you can extract something else or the whole test result if you prefer).
# p-value of a chi-square test for every group/variable pair.
# Columns 1:3 are the group columns and 4:8 the five var columns
# (4:7 would stop at var4 and leave var5 untested).
# Result: one row per variable, one column per group.
res <- apply(DF[, 1:3], 2, function(x) {
  apply(DF[, 4:8], 2, function(y) chisq.test(x, y)$p.value)
})
Here's a quick and easy dplyr solution, that involves transforming the data into long format keyed by group and var, then running the chi-sq test on each combination of group and var.
# Reshape to one row per (observation, group column, variable column), then
# run one chi-square test for every group/variable combination.
DF %>%
  pivot_longer(
    cols = starts_with("group"),
    names_to = "group",
    values_to = "group_val"
  ) %>%
  pivot_longer(
    cols = starts_with("var"),
    names_to = "var",
    values_to = "var_val"
  ) %>%
  group_by(group, var) %>%
  summarise(chisq_test(group_val, var_val)) %>%
  ungroup()

How to add an offset to mixed model

I have a data set containing the step count of cows from a 4 week trial where each animal was exposed to treatment A or treatment B at the beginning of week 2, and want to know how the step rate of the two treatment groups changed each week compared to week 1.
How do I add an offset to my model to do this?
The model I am running before adding the offset is this:
# Poisson mixed model: fixed effects for Week, Treatment and their
# interaction, plus a random intercept for each cow.
mod.1 <- glmmTMB(
  Step.count ~ Week * Treatment + (1 | Cow.ID),
  data = data.df,
  family = poisson
)
Here is an example of my data
# Simulate 20 observations for one cow: weeks drawn from 1-4, a constant
# treatment label, and Poisson-distributed step counts with mean 60.1.
simulate_cow <- function(cow_id, treatment) {
  data.frame(
    Cow.ID = rep(cow_id, 20),
    Week = sample(c(1, 2, 3, 4), 20, replace = TRUE),
    Treatment = sample(c(treatment), 20, replace = TRUE),
    Step.count = rpois(20, 60.1)
  )
}

# Two infected and two non-infected cows, stacked into one data frame.
data.1 <- simulate_cow(1, "infected")
data.2 <- simulate_cow(2, "infected")
data.3 <- simulate_cow(3, "non-infected")
data.4 <- simulate_cow(4, "non-infected")
sample.df <- rbind(data.1, data.2, data.3, data.4)
Hard to say without an example of your data, but assuming that you have a data frame something like this:
library(dplyr)
# Toy data: four cows observed in each of five weeks, with random step counts
# between 100 and 199 and treatments alternating A/B by cow.
cows <- tibble(
  Cow.Id = rep(1:4, times = 5),
  Week = rep(1:5, each = 4),
  Step.count = floor(runif(20, 100, 200)),
  Treatment = rep(c("A", "B"), times = 10)
)
Then, you can easily calculate a column of Step.count.offset for each cow like this:
# Sort by week, then express each cow's step count relative to that cow's
# earliest-week value.
cows.clean <- cows %>%
  arrange(Week) %>%
  group_by(Cow.Id) %>%
  mutate(Step.count.offset = Step.count - first(Step.count)) %>%
  ungroup()

R group by substring

Sample data
# Sample data: `WANT` is the expected label for each `name`.
data <- data.frame(
  id = c(1, 2, 3, 4, 5),
  name = c("blue", "green", "red", "read", "HUE"),
  WANT = c("ue", "re", "re", "re", "ue")
)
To explain. If 'name' contains "ue", then WANT = "ue" and if 'name' contains 're' then WANT = "re". Capitalization does not matter.
This is my attempt:
# Question's attempt, kept as posted.
# NOTE(review): the sample data above is named `data`; `df` is not defined in
# this snippet.
df$attempt <- NA
# NOTE(review): substr() is called without the start/stop positions it
# requires, so these comparisons cannot run as written.
df$attempt[substr(df$name) == "ue"] <- "ue"
df$attempt[substr(df$name) == "re"] <- "re"
A solution using stringr (part of the tidyverse).
library(tidyverse)
# Pull the first "ue" or "re" out of each name, ignoring case, and normalise
# the match to lower case.
data2 <- data %>%
  mutate(
    attempt = str_to_lower(
      str_extract(name, pattern = regex("ue|re", ignore_case = TRUE))
    )
  )
data2
# id name WANT attempt
# 1 1 blue ue ue
# 2 2 green re re
# 3 3 red re re
# 4 4 read re re
# 5 5 HUE ue ue
DATA
# Input used by the stringr solution: `WANT` is the expected label per `name`.
data <- data.frame(
  id = c(1, 2, 3, 4, 5),
  name = c("blue", "green", "red", "read", "HUE"),
  WANT = c("ue", "re", "re", "re", "ue")
)
Here are a couple of versions:
data <- data.frame(
  id = c(1, 2, 3, 4, 5),
  name = c("blue", "green", "red", "read", "HUE")
)

# Base R version: label each name by the first pattern it contains, ignoring
# case. Names matching neither pattern get a character NA, so `want` is a
# character column whatever the data.
data$want <- ifelse(
  grepl("ue", data$name, ignore.case = TRUE), "ue",
  ifelse(grepl("re", data$name, ignore.case = TRUE), "re", NA_character_)
)
#tidyverse version
library(dplyr)
# Label each name by the first pattern it contains, ignoring case; names
# matching neither pattern get NA. case_when() keeps `want` a character
# column whatever the data.
data <- data %>%
  mutate(
    want = case_when(
      grepl("ue", name, ignore.case = TRUE) ~ "ue",
      grepl("re", name, ignore.case = TRUE) ~ "re",
      TRUE ~ NA_character_
    )
  )
Try using ifelse and mutate. grepl("ue",name,ignore.case = T) checks whether "ue" occurs in name regardless of case (e.g. "ue" or "UE"). The same logic applies to "re".
library(dplyr)
# Add `Attempt`: "ue" when the name contains it (ignoring case), otherwise
# "re" when the name contains that, otherwise NA.
# TRUE is spelled out because T is an ordinary, reassignable variable.
data <- data %>%
  mutate(
    Attempt = ifelse(
      grepl("ue", name, ignore.case = TRUE), "ue",
      ifelse(grepl("re", name, ignore.case = TRUE), "re", NA_character_)
    )
  )
With purrr and dplyr:
library(dplyr)
library(purrr)
# For each row, keep WANT when it occurs in name (ignoring case). Otherwise
# return NA, so that map2_chr() always receives exactly one string per row
# instead of failing on an empty result. isTRUE() also maps a missing match
# result to NA.
data %>%
  mutate(
    group = map2_chr(
      WANT, name,
      ~ if (isTRUE(grepl(.x, .y, ignore.case = TRUE))) .x else NA_character_
    )
  )
Output:
id name WANT group
1 1 blue ue ue
2 2 green re re
3 3 red re re
4 4 read re re
5 5 HUE hu hu
Data:
# Input for the purrr solution; strings are kept as character vectors so that
# WANT can be used directly as a pattern.
data <- data.frame(
  id = c(1, 2, 3, 4, 5),
  name = c("blue", "green", "red", "read", "HUE"),
  WANT = c("ue", "re", "re", "re", "hu"),
  stringsAsFactors = FALSE
)

Resources