df <- data.frame(A = c(NA,5,4,NA,1),
B = c(1,NA,1,1,NA),
C = c(3,3,NA,NA,6),
D = c(0,0,1,1,1))
I have something like the above dataset and am trying to replace the NA values with the mean of the subgroup defined by the target variable D.
I tried the following code to replace them individually.
df <- df %>%
group_by(D) %>%
mutate(
A = ifelse(is.na(A),
mean(A, na.rm=TRUE),A)
) %>%
mutate(
B = ifelse(is.na(B),
mean(B, na.rm=TRUE),B)
) %>%
mutate(
C = ifelse(is.na(C),
mean(C, na.rm=TRUE),C)
)
Is there a more efficient way to impute the mean values?
Perhaps this 'tidyverse' approach will suit:
library(tidyverse)
df <- data.frame(A = c(NA,5,4,NA,1),
B = c(1,NA,1,1,NA),
C = c(3,3,NA,NA,6),
D = c(0,0,1,1,1))
# Baseline: within each D-group, replace the NAs of A, B and C with that
# group's mean of the same column, one explicit assignment per column.
df_output <- df %>%
  group_by(D) %>%
  mutate(
    A = ifelse(is.na(A), mean(A, na.rm = TRUE), A),
    B = ifelse(is.na(B), mean(B, na.rm = TRUE), B),
    C = ifelse(is.na(C), mean(C, na.rm = TRUE), C)
  )
df_output
#> # A tibble: 5 × 4
#> # Groups: D [2]
#> A B C D
#> <dbl> <dbl> <dbl> <dbl>
#> 1 5 1 3 0
#> 2 5 1 3 0
#> 3 4 1 6 1
#> 4 2.5 1 6 1
#> 5 1 1 6 1
# Same imputation without repeating the per-column logic: across() applies
# one function to every column in A:C, evaluated separately per D-group.
df_output_2 <- df %>%
  group_by(D) %>%
  mutate(
    across(
      A:C,
      function(col) replace_na(col, mean(col, na.rm = TRUE))
    )
  )
df_output_2
#> # A tibble: 5 × 4
#> # Groups: D [2]
#> A B C D
#> <dbl> <dbl> <dbl> <dbl>
#> 1 5 1 3 0
#> 2 5 1 3 0
#> 3 4 1 6 1
#> 4 2.5 1 6 1
#> 5 1 1 6 1
# Confirm both approaches give the same result. all_equal() is deprecated as
# of dplyr 1.1.0, so use all.equal(), which returns TRUE when they match.
all.equal(df_output, df_output_2)
#> [1] TRUE
Created on 2022-10-04 by the reprex package (v2.0.1)
I encountered the same problem before but my dataset was bigger. In these cases, I use mutate_all
# Impute every non-grouping column with its D-group mean in one call.
# funs() is deprecated (dplyr 0.8.0) and mutate_all() is superseded;
# across(everything(), ...) is the supported equivalent and, like mutate_all(),
# skips the grouping column D.
df %>%
  group_by(D) %>%
  mutate(
    across(everything(), ~ replace(.x, is.na(.x), mean(.x, na.rm = TRUE)))
  )
A B C D
<dbl> <dbl> <dbl> <dbl>
1 5 1 3 0
2 5 1 3 0
3 4 1 6 1
4 2.5 1 6 1
5 1 1 6 1
Related
I am trying to match these two data frames, nCode and grpRnk, as illustrated below.
Using the code shown at the bottom, I've been able to get the below output whereby the last column on the right shows the correct values, but I don't need those extra columns from grpRnk$Name through $subGrp (columns 6 -10) and I had specified a name for the last column of grpRnk but I instead get $grpRnk. Am I using this merge() function correctly? Is there a more correct or efficient way to do this sort of multi-factor matching?
Name Group nmCnt seqBase subGrp grpRnk$Name $Group $nmCnt $seqBase $subGrp $grpRnk
<chr> <dbl> <int> <int> <int> <chr> <dbl> <int> <int> <int> <int>
1 R 0 1 1 0 B 0 1 1 0 NA
2 R 0 2 2 0 R 0 1 1 0 NA
3 B 0 1 1 0 R 0 2 2 0 NA
4 R 0 3 3 0 R 0 3 3 0 NA
5 X 1 1 1 1 X 1 1 1 1 1
6 X 1 2 1 2 X 1 2 1 2 1
The code:
library(dplyr)
myDF1 <-
data.frame(
Name = c("R","R","B","R","X","X"),
Group = c(0,0,0,0,1,1)
)
nCode <- myDF1 %>%
group_by(Name) %>%
mutate(nmCnt = row_number()) %>%
ungroup() %>%
mutate(seqBase = ifelse(Group == 0 | Group != lag(Group), nmCnt,0)) %>%
mutate(seqBase = na_if(seqBase, 0)) %>%
group_by(Name) %>%
fill(seqBase) %>%
mutate(seqBase = match(seqBase, unique(seqBase))) %>%
ungroup() %>%
mutate(subGrp = as.integer(ifelse(Group > 0, sapply(1:n(), function(x) sum(Name[1:x]==Name[x] & Group[1:x] == Group[x])),0)))
grpRnk <- nCode %>% select(Name,Group,nmCnt) %>%
filter(Group > 0) %>%
group_by(Name,Group) %>%
slice(which.min(Group)) %>%
ungroup() %>%
arrange(nmCnt) %>%
mutate(grpRnk = dense_rank(nmCnt)) %>%
select (-nmCnt)
nCode %>% mutate(grpRnk = merge(nCode,grpRnk, by=c("Name","Group"), all.x=T))
You need to specify what column you want to extract into your newly created grpRnk column. You can achieve this by adding $grpRnk to the end of your merge() statement.
Here I have first a solution with merge():
library(tidyverse)
myDF1 <-
data.frame(
Name = c("R","R","B","R","X","X"),
Group = c(0,0,0,0,1,1)
)
# Build the per-row counters used for matching:
#   nmCnt   - running occurrence number of each Name
#   seqBase - nmCnt where Group is 0 or differs from the previous row,
#             otherwise filled down within Name, then recoded to the position
#             of its first appearance among the Name's unique values
#   subGrp  - for Group > 0, the running count of rows sharing Name and Group;
#             0 otherwise
nCode <- myDF1 %>%
  group_by(Name) %>%
  mutate(nmCnt = row_number()) %>%
  ungroup() %>%
  mutate(seqBase = ifelse(Group == 0 | Group != lag(Group), nmCnt, 0)) %>%
  mutate(seqBase = na_if(seqBase, 0)) %>%
  group_by(Name) %>%
  fill(seqBase) %>%
  mutate(seqBase = match(seqBase, unique(seqBase))) %>%
  # A grouped row_number() gives the same running count as the original
  # sapply(1:n(), ...) scan, but in linear time, with a stable integer type,
  # and without the 1:0 pitfall on an empty data frame.
  group_by(Name, Group) %>%
  mutate(subGrp = as.integer(ifelse(Group > 0, row_number(), 0L))) %>%
  ungroup()
# Lookup table: one row per (Name, Group) pair with Group > 0, ranked by the
# nmCnt of the row kept for that pair.
grpRnk <- nCode %>%
  filter(Group > 0) %>%
  select(Name, Group, nmCnt) %>%
  group_by(Name, Group) %>%
  slice(which.min(Group)) %>%
  ungroup() %>%
  arrange(nmCnt) %>%
  mutate(grpRnk = dense_rank(nmCnt)) %>%
  select(-nmCnt)
# merge() sorts its result by the key columns, so the merged rows are not in
# nCode's row order and `merge(...)$grpRnk` can be assigned to the wrong rows.
# Carry a row index through the merge and use it to restore nCode's order
# before extracting $grpRnk. (grpRnk holds one row per Name/Group pair, so the
# merge returns exactly one row per nCode row.)
merged <- merge(
  cbind(nCode, row_id = seq_len(nrow(nCode))),
  grpRnk,
  by = c("Name", "Group"),
  all.x = TRUE
)
nCode %>% mutate(grpRnk = merged$grpRnk[order(merged$row_id)])
#> # A tibble: 6 × 6
#> Name Group nmCnt seqBase subGrp grpRnk
#> <chr> <dbl> <int> <int> <int> <int>
#> 1 R 0 1 1 0 NA
#> 2 R 0 2 2 0 NA
#> 3 B 0 1 1 0 NA
#> 4 R 0 3 3 0 NA
#> 5 X 1 1 1 1 1
#> 6 X 1 2 1 2 1
However, you can achieve the same result using only merge() like this:
merge(nCode,grpRnk, by=c("Name","Group"), all.x = TRUE)
#> Name Group nmCnt seqBase subGrp grpRnk
#> 1 B 0 1 1 0 NA
#> 2 R 0 1 1 0 NA
#> 3 R 0 2 2 0 NA
#> 4 R 0 3 3 0 NA
#> 5 X 1 1 1 1 1
#> 6 X 1 2 1 2 1
And here is another solution using left_join() from the dplyr package:
left_join(nCode, grpRnk, by = c("Name", "Group"))
#> # A tibble: 6 × 6
#> Name Group nmCnt seqBase subGrp grpRnk
#> <chr> <dbl> <int> <int> <int> <int>
#> 1 R 0 1 1 0 NA
#> 2 R 0 2 2 0 NA
#> 3 B 0 1 1 0 NA
#> 4 R 0 3 3 0 NA
#> 5 X 1 1 1 1 1
#> 6 X 1 2 1 2 1
Created on 2022-09-19 with reprex v2.0.2
This is an example dataframe
means2 <- as.data.frame(matrix(runif(n=25, min=1, max=20), nrow=5))
names(means2) <- c("B_T0|B_T0", "B_T0|B_T1", "B_T0|Fibro_T0", "B_T5|Endo_T5", "Macro_T1|Fibro_T1")
I have column names in my dataframe in R in this format
\S+_T\d+|\S+_T\d+
The syntax is something like (Name)_ (T)(Number) | (Name)_ (T)(Number)
Step 1) I want to select columns which contain the same (T)(Number) on both sides of the "|"
I did this with some manual labor :
means_t0 <- means2 %>% select(matches("\\S+_T0\\|\\S+_T0")) %>% rownames_to_column("id_cp_interaction")
means_t1 <- means2 %>% select(matches("\\S+_T1\\|\\S+_T1")) %>% rownames_to_column("id_cp_interaction")
means_t5 <- means2 %>% select(matches("\\S+_T5\\|\\S+_T5")) %>% rownames_to_column("id_cp_interaction")
means3 <- full_join(means_t0, means_t1) %>% full_join(means_t5)
This gives me what I want and it was easy to do because I only had 3 types - T0, T1 and T5. What do I do if I had a huge number?
Step 2) From the output of Step1, I want to do a negation of the last question i.e. select only those columns with Names which are not the same
For example B_T0|B_T0 should be removed but B_T0|Fibro_T0 should be retained
Is there a way to regex capture the part in front of the pipe(|) and match it to the part at the back of the pipe(|)
Thank you
If you have that much information in your column names, I like to transform the data into the long format and then separate the info from the column name into several columns. Then it's easy to filter by these columns:
means2 <- as.data.frame(matrix(runif(n=25, min=1, max=20), nrow=5))
names(means2) <- c("B_T0|B_T0", "B_T0|B_T1", "B_T0|Fibro_T0", "B_T5|Endo_T5", "Macro_T1|Fibro_T1")
means2 <- cbind(data.frame(id_cp_interaction = 1:5), means2)
library(tidyr)
library(dplyr)
library(stringr)
# Reshape to long format and split each "<cell>_T<n>|<cell>_T<n>" column name
# into its four components so they can be filtered as ordinary columns.
res <- means2 %>%
  pivot_longer(
    cols = -id_cp_interaction,
    names_to = "names",
    values_to = "values"
  ) %>%
  mutate(
    # everything before the first underscore
    celltype_1 = str_extract(names, "^[^_]*"),
    # digits directly in front of the literal "|". The pipe must be escaped:
    # an unescaped `(?=|)` is an empty alternation that matches anywhere.
    # `+` keeps multi-digit timepoints (e.g. T10) intact.
    timepoint_1 = str_extract(names, "[0-9]+(?=\\|)"),
    # text between the "|" and the following underscore
    celltype_2 = str_extract(names, "(?<=\\|)(.*?)(?=_)"),
    # trailing digits of the name
    timepoint_2 = str_extract(names, "[0-9]+$")
  )
head(res, n = 7)
#> # A tibble: 7 × 7
#> id_cp_interaction names values celltype_1 timepoint_1 celltype_2 timepoint_2
#> <int> <chr> <dbl> <chr> <chr> <chr> <chr>
#> 1 1 B_T0|B… 1.68 B 0 B 0
#> 2 1 B_T0|B… 19.3 B 0 B 1
#> 3 1 B_T0|F… 10.6 B 0 Fibro 0
#> 4 1 B_T5|E… 12.5 B 5 Endo 5
#> 5 1 Macro_… 2.84 Macro 1 Fibro 1
#> 6 2 B_T0|B… 2.17 B 0 B 0
#> 7 2 B_T0|B… 10.1 B 0 B 1
# only keep interactions of different cell types
res %>%
filter(celltype_1 != celltype_2) %>%
head()
#> # A tibble: 6 × 7
#> id_cp_interaction names values celltype_1 timepoint_1 celltype_2 timepoint_2
#> <int> <chr> <dbl> <chr> <chr> <chr> <chr>
#> 1 1 B_T0|F… 10.6 B 0 Fibro 0
#> 2 1 B_T5|E… 12.5 B 5 Endo 5
#> 3 1 Macro_… 2.84 Macro 1 Fibro 1
#> 4 2 B_T0|F… 1.47 B 0 Fibro 0
#> 5 2 B_T5|E… 11.3 B 5 Endo 5
#> 6 2 Macro_… 13.0 Macro 1 Fibro 1
Created on 2022-09-19 by the reprex package (v1.0.0)
I found solutions for simple vectors, but is there a way to make all pairwise differences using dplyr or base R for all the elements in a category?
library(tidyverse)
x = 1:10
y = rep(letters[1:5],each=2)
z = rep(1:2,length.out =10)
df = data.frame(x,y, z)
df = rbind(df,c(11,"e",3))
df$verif = paste0(df$y,df$z)
df$x = as.numeric(df$x)
df %>%
group_by(y) %>%
summarise(Diff = abs(x - lag(x)))
gives:
`summarise()` regrouping output by 'y' (override with `.groups` argument)
# A tibble: 11 x 2
# Groups: y [5]
y Diff
<chr> <dbl>
1 a NA
2 a 1
3 b NA
4 b 1
5 c NA
6 c 1
7 d NA
8 d 1
9 e NA
10 e 1
11 e 1
In this example, it's only using the previous value in the data frame, therefore missing pairwise differences (look at 9, 10 and 11 for group "e" ).
Is there a way to get all the pairwise differences in each category? Keeping track of the pairwise differences would be useful as well (e.g., e1 with e2 = 1, e2 with e3 is = 1 and e1 with e3 is =2)
I tried the outer() function, as well as the dist() function, but wasn't able to make either work.
I continued to try and found this:
my.df=df %>%
group_by(y) %>%
summarise(Diff = combn(x,2,diff))
my.df
# A tibble: 7 x 2
# Groups: y [5]
y Diff
<chr> <dbl>
1 a 1
2 b 1
3 c 1
4 d 1
5 e 1
6 e 2
7 e 1
I just now need to get which pairwise difference was calculated...
Continued again and got this mess:
my.df=df %>%
group_by(y) %>%
summarise(Diff = combn(x,2,diff),
test = combn(verif,2,paste, simplify = FALSE)) %>%
mutate(test2 = paste0(test, collapse = "-"))
my.df
> my.df
# A tibble: 7 x 4
# Groups: y [5]
y Diff test test2
<chr> <dbl> <list> <chr>
1 a 1 <chr [2]> "c(\"a1\", \"a2\")"
2 b 1 <chr [2]> "c(\"b1\", \"b2\")"
3 c 1 <chr [2]> "c(\"c1\", \"c2\")"
4 d 1 <chr [2]> "c(\"d1\", \"d2\")"
5 e 1 <chr [2]> "c(\"e1\", \"e2\")-c(\"e1\", \"e3\")-c(\"e2\", \"e3\")"
6 e 2 <chr [2]> "c(\"e1\", \"e2\")-c(\"e1\", \"e3\")-c(\"e2\", \"e3\")"
7 e 1 <chr [2]> "c(\"e1\", \"e2\")-c(\"e1\", \"e3\")-c(\"e2\", \"e3\")"
Got it:
library(tidyverse)
x = 1:10
y = rep(letters[1:5],each=2)
z = rep(1:2,length.out =10)
df = data.frame(x,y, z)
df = rbind(df,c(11,"e",3))
df$verif = paste0(df$y,df$z)
df$x = as.numeric(df$x)
# All pairwise differences of x within each y-group, labelled with the pair
# of verif ids that produced them. combn() forwards `collapse` to paste(), so
# each pair is joined straight into an "id1-id2" label.
my.df <- df %>%
  group_by(y) %>%
  summarise(
    Diff = combn(x, 2, diff),
    test2 = combn(verif, 2, paste, collapse = "-")
  )
Here is the output
my.df
# A tibble: 7 x 3
# Groups: y [5]
y Diff test2
<chr> <dbl> <chr>
1 a 1 a1-a2
2 b 1 b1-b2
3 c 1 c1-c2
4 d 1 d1-d2
5 e 1 e1-e2
6 e 2 e1-e3
7 e 1 e2-e3
You could do:
library(tidyverse)
# For every pair of row positions inside a y-group, record the difference in
# x (test1) and the "-"-joined verif labels (test2); the resulting list
# column is then spread into regular columns.
# NOTE(review): dplyr >= 1.1.0 deprecates multi-row summarise() results in
# favour of reframe() - confirm the installed version before switching.
df %>%
  group_by(y) %>%
  summarise(
    result = combn(
      seq_along(x), 2,
      FUN = function(idx) {
        list(
          test1 = diff(x[idx]),
          test2 = paste(verif[idx], collapse = "-")
        )
      },
      simplify = FALSE
    ),
    .groups = "drop"
  ) %>%
  unnest_wider(result)
# A tibble: 7 x 3
y test1 test2
<chr> <dbl> <chr>
1 a 1 a1-a2
2 b 1 b1-b2
3 c 1 c1-c2
4 d 1 d1-d2
5 e 1 e1-e2
6 e 2 e1-e3
7 e 1 e2-e3
I have a number of large data frames that have the following basic format, where the final two rows are a mean (d) and standard deviation (e) - although these are calculated elsewhere.
a b c
a 4 3 4
b 3 2 6
c 2 1 8
d 3 2 6
e 1 1 2
I would like to create an iterative function that converts each raw data point into a z-score via the mean and sd value in d and e per column. The formula I would like to apply is ((x-mean)/SD).
The result would be the following:
a b c
a 1 1 1
b 0 0 0
c -1 -1 -1
I don't mind if this is added to the end, created as a new dataframe or the data is converted.
Thanks!
Here is one approach, note that I do not use the mean/sd provided in the data but re-calculate it on the fly.
Also note that usually the data should be in a tidy data representation, which in your case would mean that a, b, c would be in columns and then mean/sd would be either calculated on the fly or be in a separate column (note that this would require reshaping the data, not shown here).
# your input data
raw_data <- data.frame(
  a = c(4, 3, 2, 3, 1),
  b = c(3, 2, 1, 2, 1),
  c = c(4, 6, 8, 6, 2),
  row.names = c("a", "b", "c", "d", "e")
)
raw_data
#> a b c
#> a 4 3 4
#> b 3 2 6
#> c 2 1 8
#> d 3 2 6
#> e 1 1 2
# keep only the observation rows, i.e. drop the d (mean) and e (sd) rows
data <- raw_data[setdiff(rownames(raw_data), c("d", "e")), ]
data
#> a b c
#> a 4 3 4
#> b 3 2 6
#> c 2 1 8
# per-column mean, recomputed from the observations
means <- colMeans(data)
means
#> a b c
#> 3 2 6
# per-column standard deviation, recomputed from the observations
sds <- vapply(data, sd, numeric(1))
sds
#> a b c
#> 1 1 2
# z-score: centre each column on its mean, then divide by its sd
z_scores <- sweep(sweep(as.matrix(data), 2, means, "-"), 2, sds, "/")
z_scores
#> a b c
#> a 1 1 -1
#> b 0 0 0
#> c -1 -1 1
Created on 2021-01-07 by the reprex package (v0.3.0)
Edit / Full Code
The following code is a bit longer but most of it is spent on getting the data into the right (long/tidy) format.
If you have any questions, feel free to use the comments.
Note that the tidyverse is really helpful, but might need some time to get used to. The code used here is mostly dplyr (included in the tidyverse).
If you understand the functions: %>% (pipe), group_by(), mutate(), summarise(), and pivot_longer/wider() you got everything.
library(tidyverse)
# use your original dataset again
raw_data <- data.frame(
a = c(4, 3, 2, 3, 1),
b = c(3, 2, 1, 2, 1),
c = c(4, 6, 8, 6, 2),
row.names = c("a", "b", "c", "d", "e")
)
### 1) Turn the data into a nicer format
# match-table how to rename the variables
var_match <- c(d = "mean", e = "sd")
# convert the raw data into a nicer format, first we do some minor changes
# (variable names, etc)
data_mixed <- raw_data %>%
# have the rownames as explicit variable
rownames_to_column("metric") %>%
# nicer printing etc
as_tibble() %>%
# replace variable names with mean/sd
mutate(metric = ifelse(metric %in% c("d", "e"),
var_match[metric], metric))
data_mixed
#> # A tibble: 5 x 4
#> metric a b c
#> <chr> <dbl> <dbl> <dbl>
#> 1 a 4 3 4
#> 2 b 3 2 6
#> 3 c 2 1 8
#> 4 mean 3 2 6
#> 5 sd 1 1 2
# separate the dataset into two:
# data holds the values
# data_vars holds the metrics mean and sd
data <- data_mixed %>% filter(!metric %in% var_match) %>% select(-metric)
data_vars <- data_mixed %>% filter(metric %in% var_match)
data
#> # A tibble: 3 x 3
#> a b c
#> <dbl> <dbl> <dbl>
#> 1 4 3 4
#> 2 3 2 6
#> 3 2 1 8
data_vars
#> # A tibble: 2 x 4
#> metric a b c
#> <chr> <dbl> <dbl> <dbl>
#> 1 mean 3 2 6
#> 2 sd 1 1 2
# turn the value dataset into its longer form, makes it easier to work with it later
data_long <- data %>%
pivot_longer(everything(), names_to = "var", values_to = "val")
data_long
#> # A tibble: 9 x 2
#> var val
#> <chr> <dbl>
#> 1 a 4
#> 2 b 3
#> 3 c 4
#> 4 a 3
#> 5 b 2
#> 6 c 6
#> 7 a 2
#> 8 b 1
#> 9 c 8
# turn the metric dataset into another long form, allowing easy combination in the next step
# One row per variable with its mean and sd as columns: go long first, then
# widen on the metric. id_cols is passed by name because supplying it
# positionally is deprecated since tidyr 1.3.0.
data_vars2 <- data_vars %>%
  pivot_longer(-metric, names_to = "var", values_to = "val") %>%
  pivot_wider(id_cols = var, names_from = metric, values_from = val)
data_vars2
#> # A tibble: 3 x 3
#> var mean sd
#> <chr> <dbl> <dbl>
#> 1 a 3 1
#> 2 b 2 1
#> 3 c 6 2
# combine the datasets
data_all <- left_join(data_long, data_vars2, by = "var")
data_all
#> # A tibble: 9 x 4
#> var val mean sd
#> <chr> <dbl> <dbl> <dbl>
#> 1 a 4 3 1
#> 2 b 3 2 1
#> 3 c 4 6 2
#> 4 a 3 3 1
#> 5 b 2 2 1
#> 6 c 6 6 2
#> 7 a 2 3 1
#> 8 b 1 2 1
#> 9 c 8 6 2
## 2) calculate the z-score
# now comes the actual number crunchin!
# per variable var (a, b, c) compute the variable val_z as the z-score
data_res <- data_all %>%
group_by(var) %>%
mutate(val_z = (val - mean) / sd)
data_res
#> # A tibble: 9 x 5
#> # Groups: var [3]
#> var val mean sd val_z
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 a 4 3 1 1
#> 2 b 3 2 1 1
#> 3 c 4 6 2 -1
#> 4 a 3 3 1 0
#> 5 b 2 2 1 0
#> 6 c 6 6 2 0
#> 7 a 2 3 1 -1
#> 8 b 1 2 1 -1
#> 9 c 8 6 2 1
## 3) make the results more readable
# lastly pivot the results to its original form
data_res_wide <- data_res %>%
  select(var, val_z) %>%
  group_by(var) %>%
  # position of each value within its variable; serves as the row key below
  mutate(id = row_number()) %>%
  # drop the grouping explicitly so the pivot works on a plain tibble
  ungroup() %>%
  # id_cols is named: positional use is deprecated since tidyr 1.3.0
  pivot_wider(id_cols = id, names_from = var, values_from = val_z)
data_res_wide
#> # A tibble: 3 x 4
#> id a b c
#> <int> <dbl> <dbl> <dbl>
#> 1 1 1 1 -1
#> 2 2 0 0 0
#> 3 3 -1 -1 1
Created on 2021-01-07 by the reprex package (v0.3.0)
I'm trying to assess which unit in a pair is the "winner". group_by() %>% mutate() is close to the right thing, but it's not quite there. in particular
dat %>% group_by(pair) %>% mutate(winner = ifelse(score[1] > score[2], c(1, 0), c(0, 1))) doesn't work.
The below does, but is clunky with an intermediate summary data frame. Can we improve this?
library(tidyverse)
set.seed(343)
# units within pairs get scores
# Three pairs of two units each, with a random score per unit.
# tibble() replaces data_frame(), which is deprecated in the tibble package.
dat <-
  tibble(
    pair = rep(1:3, each = 2),
    unit = rep(1:2, 3),
    score = rnorm(6)
  )
# figure out who won in each pair
summary_df <-
dat %>%
group_by(pair) %>%
summarize(winner = which.max(score))
# merge back and determine whether each unit won
dat <-
left_join(dat, summary_df, "pair") %>%
mutate(won = as.numeric(winner == unit))
dat
#> # A tibble: 6 x 5
#> pair unit score winner won
#> <int> <int> <dbl> <int> <dbl>
#> 1 1 1 -1.40 2 0
#> 2 1 2 0.523 2 1
#> 3 2 1 0.142 1 1
#> 4 2 2 -0.847 1 0
#> 5 3 1 -0.412 1 1
#> 6 3 2 -1.47 1 0
Created on 2018-09-26 by the reprex
package (v0.2.0).
maybe related to Weird group_by + mutate + which.max behavior
You could do:
# Flag the winning unit of each pair and record which unit won.
# The original snippet ended with a dangling `%>%`, which left the pipeline
# incomplete; it is removed here.
dat %>%
  group_by(pair) %>%
  mutate(
    # TRUE for the unit holding the pair's highest score
    won = score == max(score),
    # the unit number of the row flagged as the winner
    winner = unit[won]
  )
# A tibble: 6 x 5
# Groups: pair [3]
pair unit score won winner
<int> <int> <dbl> <lgl> <int>
1 1 1 -1.40 FALSE 2
2 1 2 0.523 TRUE 2
3 2 1 0.142 TRUE 1
4 2 2 -0.847 FALSE 1
5 3 1 -0.412 TRUE 1
6 3 2 -1.47 FALSE 1
Using rank:
dat %>% group_by(pair) %>% mutate(won = rank(score) - 1)
More for fun (and slightly faster), using the outcome of the comparison (score[1] > score[2]) to index a vector with 'won alternatives' :
# (score[1] > score[2]) is TRUE/FALSE, which adds 1/0 to the index 1:2:
#   first unit wins -> index 2:3 -> c(1, 0)
#   otherwise       -> index 1:2 -> c(0, 1)
dat %>% group_by(pair) %>%
mutate(won = c(0, 1, 0)[1:2 + (score[1] > score[2])])