I have a data frame with a subset of variables that starts with 'AA_' (e.g., AA_1, AA_2, ... AA_100) along with other variables X, Y, Z.
If I would like to get the product of all the 'AA_' variables, what would be the most efficient way in R to achieve this?
I am thinking something like
mydata = mydata %>%
mutate(AA_product = reduce(starts_with('AA_'), `*`))
but it does not quite work.
Here, we need to select() the data inside mutate():
library(dplyr)
library(purrr)
mydata %>%
mutate(AA_product = reduce(select(., starts_with('AA_')), `*`))
Output:
# X Y Z AA_1 AA_2 AA_3 AA_product
#1 1 2 3 1 2 3 6
#2 2 3 4 2 3 4 24
#3 3 4 5 3 4 5 60
Another, less efficient, approach is rowwise() with c_across():
mydata %>%
rowwise() %>%
mutate(AA_prod = prod(c_across(starts_with('AA')))) %>%
ungroup
data
mydata <- data.frame(X = 1:3, Y = 2:4, Z = 3:5,
AA_1 = 1:3, AA_2 = 2:4, AA_3 = 3:5)
If you want a row-wise product for the "AA_" columns, you can do this in base R with Reduce:
cols <- grep('^AA_', names(mydata))
mydata$AA_product <- Reduce(`*`, mydata[cols])
and with apply:
mydata$AA_product <- apply(mydata[cols], 1, prod)
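For what it's worth, with a more recent dplyr (version 1.0 or later) the OP's original reduce() idea works almost as written, because across() can stand in for select(.) inside mutate(). A minimal sketch, assuming that dplyr version:
library(dplyr)
library(purrr)
# across() picks the AA_ columns inside mutate(); reduce() then multiplies
# them column by column, which gives the row-wise product
mydata %>%
  mutate(AA_product = reduce(across(starts_with('AA_')), `*`))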
I do not understand the .funs argument to mutate_all() in the dplyr package. In all likelihood the problem lies with me, but I would like to understand what I am missing.
I often have to recode multiple variables, like sets of Likert items.
The sample code below replicates the problem I often have, and my own solution, but to me my solution does not look like the help documentation. So what am I missing?
#Data
var1<-sample(c('A', 'B', 'C'), 100, replace=T)
var2<-sample(c('A', 'B', 'C'), 100, replace=T)
dat<-data.frame(var1, var2)
library(tidyverse)
library(car)
#As per help documentation
dat %>%
mutate_all(., .funs(Recode(., "'A'=1"))) # This doesn't work, generates an error
#This works, but the help documentation does not get you there in any way, unless I am missing something.
dat %>%
mutate_all(., funs(Recode(., "'A'=1")))
In recent versions of dplyr, list() takes the place of funs(), i.e. wrapping with list() instead of funs() (a sketch of that form follows the examples below).
library(dplyr) #v 0.8.3
library(car)
So, either
dat %>%
mutate_all(.funs = ~Recode(., "'A' = 1")) %>%
head(5)
# var1 var2
#1 B C
#2 B C
#3 B C
#4 B 1
#5 C C
Or
dat %>%
mutate_all(~ Recode(., "'A' = 1")) %>%
head(5)
# var1 var2
#1 B C
#2 B C
#3 B C
#4 B 1
#5 C C
Or even without the anonymous function call
dat %>%
mutate_all(Recode, "'A' = 1") %>%
head(5)
# var1 var2
#1 B C
#2 B C
#3 B C
#4 B 1
#5 C C
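For completeness, the list() wrapping mentioned at the top of this answer would look roughly like this (a sketch in the dplyr 0.8.x style; the output should match the examples above):
# Wrap the recoding call in list() instead of the superseded funs()
dat %>%
  mutate_all(.funs = list(~ Recode(., "'A' = 1"))) %>%
  head(5)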
Suppose I want to create a mean variable in a given dataframe based on two vectors, one specifying the names of the variables to use, and one specifying weights by which these variables should go into the mean variable:
vars <- c("a", "b", "c", "d")
weights <- c(0.5, 0.7, 0.8, 0.2)
df <- data.frame(cbind(c(1,4,5,7), c(2,3,7,5), c(1,1,2,3),
c(4,5,3,3), c(3,2,2,1), c(5,5,7,1)))
colnames(df) <- c("a","b","c","d","e","f")
How could I use dplyr::mutate() to create a mean variable that uses vars and weights to calculate a row-wise score? mutate() should specifically use the variables supplied by vars.
The result should basically do the following:
df <- df %>%
rowwise() %>%
mutate(comp = mean(c(vars[1]*weights[1], vars[2]*weights[2], ...)))
Written out:
df2 <- df %>%
rowwise() %>%
mutate(comp = mean(c(0.5*a, 0.7*b, 0.8*c, 0.2*d)))
I can't figure out how to do this because, although vars contains the exact variable names that I want to use for mutate in my df, inside vars they are strings. How could I make mutate() understand that the strings in vars refer to columns in my df? If you know another procedure that doesn't use mutate(), that's fine too. Thanks!
You may use
df %>% mutate(wmean = apply(.[vars], 1, weighted.mean, weights))
# a b c d e f wmean
# 1 1 2 1 4 3 5 1.590909
# 2 4 3 1 5 2 5 2.681818
# 3 5 7 2 3 2 7 4.363636
# 4 7 5 3 3 1 1 4.545455
but there is not much to gain with the tidyverse here, as base R approaches can be almost the same and end up shorter:
df$wmean <- apply(df[vars], 1, weighted.mean, weights)
or one of the following:
df$wmean <- colSums(t(df[vars]) * weights) / sum(weights)
df$wmean <- as.matrix(df[vars]) %*% weights / sum(weights)
df$wmean <- rowSums(sweep(df[vars], 2, weights, `*`)) / sum(weights)
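If you do want to stay inside mutate() as the question asks, a sketch with a newer dplyr (1.0 or later, for c_across() and all_of()) could look like this:
library(dplyr)
# rowwise() plus c_across(all_of(vars)) lets mutate() use the column names
# stored as strings in vars, paired element-wise with the weights vector
df %>%
  rowwise() %>%
  mutate(wmean = weighted.mean(c_across(all_of(vars)), weights)) %>%
  ungroup()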
Row-wise operations can be a bit tricky in the tidyverse. This is a case where some base R knowledge can be really handy. For example, you can do it in one line with apply (note that columns e and f, which do not have weights, are dropped):
vars <- c("a", "b", "c","d")
weights <- c(0.5, 0.7, 0.8, 0.2)
df <- data.frame(cbind(c(1,4,5,7), c(2,3,7,5), c(1,1,2,3),
c(4,5,3,3), c(3,2,2,1), c(5,5,7,1)))
colnames(df) <- c("a","b","c","d","e","f")
library(dplyr) # for %>% and select()
df$weighted.mean <- apply(df %>% select(-e, -f), 1, weighted.mean, weights)
a b c d e f weighted.mean
1 1 2 1 4 3 5 1.590909
2 4 3 1 5 2 5 2.681818
3 5 7 2 3 2 7 4.363636
4 7 5 3 3 1 1 4.545455
If you really wanted to do it in the tidyverse, this should get you started:
library(tidyverse)
df.weights <- data.frame(vars, weights)
df.new <- df %>%
mutate(row.num = 1:n()) %>%
gather(variable, value, -row.num) %>%
left_join(df.weights, by = c(variable = 'vars')) %>%
filter(variable %in% vars) %>%
group_by(row.num) %>%
mutate(weighted.mean = weighted.mean(value, weights))
There should be a tidyverse solution using pmap, but it eludes me (a rough sketch follows the output below). Here's another approach using the tidyverse packages purrr and tibble:
library(tidyverse)
vars <- c("a", "b", "c", "d")
weights <- c(0.5, 0.7, 0.8, 0.2)
df <- data.frame(cbind(c(1,4,5,7), c(2,3,7,5), c(1,1,2,3),
c(4,5,3,3), c(3,2,2,1), c(5,5,7,1)))
colnames(df) <- c("a","b","c","d","e","f")
df %>%
transpose() %>%
simplify_all() %>%
map_dbl(~weighted.mean(.x[vars], weights)) %>%
add_column(df, wmean = .)
#> a b c d e f wmean
#> 1 1 2 1 4 3 5 1.590909
#> 2 4 3 1 5 2 5 2.681818
#> 3 5 7 2 3 2 7 4.363636
#> 4 7 5 3 3 1 1 4.545455
Created on 2018-11-24 by the reprex package (v0.2.1)
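The pmap approach mentioned above might look roughly like this (a sketch only; it reuses the magrittr dot to subset the vars columns, as in the first answer):
library(dplyr)
library(purrr)
# pmap_dbl() walks over the rows of the selected columns in parallel; c(...)
# collects each row's values so weighted.mean() can pair them with weights
df %>%
  mutate(wmean = pmap_dbl(.[vars], function(...) weighted.mean(c(...), weights)))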
Here is my problem, which I can't solve:
Data:
df <- data.frame(f1=c("a", "a", "b", "b", "c", "c", "c"),
v1=c(10, 11, 4, 5, 0, 1, 2))
In this data.frame, f1 is a factor:
f1 v1
a 10
a 11
b 4
b 5
c 0
c 1
c 2
# What I want (for example): fetch the data for the levels whose number of elements == 2, then turn it into a data.frame
a b
10 4
11 5
Thanks in advance!
I might be missing something simple here, but the approach below using dplyr and tidyr works.
library(dplyr)
library(tidyr) # for spread()
nlevels = 2
df1 <- df %>%
add_count(f1) %>%
filter(n == nlevels) %>%
select(-n) %>%
mutate(rn = row_number()) %>%
spread(f1, v1) %>%
select(-rn)
This gives
# a b
# <int> <int>
#1 10 NA
#2 11 NA
#3 NA 4
#4 NA 5
Now, if you want to remove the NAs, we can do:
do.call("cbind.data.frame", lapply(df1, function(x) x[!is.na(x)]))
# a b
#1 10 4
#2 11 5
As we have filtered the data frame so that each kept group has exactly nlevels observations, we end up with the same number of rows for each column in the final data frame.
split might be useful here to split df$v1 into parts corresponding to df$f1. Since you are always extracting equal-length chunks, the result can then simply be combined back into a data.frame:
spl <- split(df$v1, df$f1)
data.frame(spl[lengths(spl)==2])
# a b
#1 10 4
#2 11 5
Or do it all in one call by combining this with Filter:
data.frame(Filter(function(x) length(x)==2, split(df$v1, df$f1)))
# a b
#1 10 4
#2 11 5
Here is a solution using unstack:
unstack(
droplevels(df[ave(df$v1, df$f1, FUN = function(x) length(x) == 2)==1,]),
v1 ~ f1)
# a b
# 1 10 4
# 2 11 5
A variant, similar to @thelatemail's solution:
data.frame(Filter(function(x) length(x) == 2, unstack(df,v1 ~ f1)))
My tidyverse solution would be:
library(tidyverse)
df %>%
group_by(f1) %>%
filter(n() == 2) %>%
mutate(i = row_number()) %>%
spread(f1, v1) %>%
select(-i)
# # A tibble: 2 x 2
# a b
# * <dbl> <dbl>
# 1 10 4
# 2 11 5
or mixing approaches:
as_tibble(keep(unstack(df, v1 ~ f1), ~ length(.x) == 2))
Using all base functions (but you should use the tidyverse):
# Copy the data and add a count of instances per level
x <- df
x$len <- ave(x$v1, x$f1, FUN = length)
# Filter, drop the count
x <- x[x$len==2, c('f1','v1')]
# Hacky pivot
result <- data.frame(
lapply(unique(x$f1), FUN = function(y) x$v1[x$f1==y])
)
colnames(result) <- unique(x$f1)
> result
a b
1 10 4
2 11 5
I'd code it like this; maybe it helps you:
library(reshape2)
library(dplyr)
aa = data.frame(v1=c('a','a','b','b','c','c','c'),f1=c(10,11,4,5,0,1,2))
cc = aa %>% group_by(v1) %>% summarise(id = length(v1))
dd = merge(aa,cc) # get the level counts
ee = dd[dd$id == 2,] # select the groups whose count equals 2
ee$id = rep(c(1,2),nrow(ee)/2) # reset index like (1,2,1,2)
dcast(ee, id~v1,value.var = 'f1')
all done!
I have a data frame that's of this structure:
df <- data.frame(var1 = c(1,1,1,2,2,3,3,3,3),
cat1 = c("A","B","D","B","C","D","E","B","A"))
> df
var1 cat1
1 1 A
2 1 B
3 1 D
4 2 B
5 2 C
6 3 D
7 3 E
8 3 B
9 3 A
And I am looking to create both nodes and edges data frames from it, so that I can draw a network graph, using VisNetwork. This network will show the number/strength of connections between the different cat1 values, as grouped by the var1 value.
I have the nodes data frame sorted:
nodes <- data.frame(id = unique(df$cat1))
> nodes
id
1 A
2 B
3 D
4 C
5 E
What I'd like help with is how to process df in the following manner:
For each distinct value of var1 in df, tally up the nodes that occur together for that value of var1, to give an edges data frame that ultimately looks like the one below. Note that I'm not bothered about the direction of flow along the edges; that they are connected is all I need.
> edges
from to value
1 A B 2
2 A D 2
3 A E 1
4 B C 1
5 B D 2
6 B E 1
7 D E 1
With thanks in anticipation,
Nevil
Update: I found a similar problem here, and have adapted that code to produce the following, which is getting close to what I want, but not quite there...
> df %>% group_by(var1) %>%
filter(n()>=2) %>% group_by(var1) %>%
do(data.frame(t(combn(.$cat1, 2,function(x) sort(x))),
stringsAsFactors=FALSE))
# A tibble: 10 x 3
# Groups: var1 [3]
var1 X1 X2
<dbl> <chr> <chr>
1 1. A B
2 1. A D
3 1. B D
4 2. B C
5 3. D E
6 3. B D
7 3. A D
8 3. B E
9 3. A E
10 3. A B
I don't know if there is already a suitable function to achieve this task. Here is a detailed procedure to do it; with this, you should be able to define your own function (a sketch of one follows the output below). Hope it helps!
# create an adjacency matrix
mat <- table(df)
mat <- t(mat) %*% mat
as.table(mat) # look at your adjacency matrix
# since the network is not directed, we can consider only the (strictly) upper triangular matrix
mat[lower.tri(mat, diag = TRUE)] <- 0
as.table(mat) # look at the new adjacency matrix
library(dplyr)
edges <- as.data.frame(as.table(mat))
edges <- filter(edges, Freq != 0)
colnames(edges) <- c("from", "to", "value")
edges <- arrange(edges, from)
edges # output
# from to value
#1 A B 2
#2 A D 2
#3 A E 1
#4 B C 1
#5 B D 2
#6 B E 1
#7 D E 1
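If you want to wrap this up into the reusable function mentioned above, a minimal sketch could be the following (the name make_edges is made up for illustration; the commented-out line shows the standard visNetwork(nodes, edges) call the question is aiming for):
# Hypothetical helper bundling the adjacency-matrix steps above
make_edges <- function(df) {
  mat <- table(df)                        # var1 x cat1 incidence table
  mat <- t(mat) %*% mat                   # cat1 x cat1 co-occurrence counts
  mat[lower.tri(mat, diag = TRUE)] <- 0   # undirected: keep the upper triangle
  edges <- as.data.frame(as.table(mat))
  edges <- edges[edges$Freq != 0, ]
  colnames(edges) <- c("from", "to", "value")
  edges[order(edges$from), ]
}
edges <- make_edges(df)
nodes <- data.frame(id = unique(df$cat1))
# visNetwork::visNetwork(nodes, edges)    # draw the graph (needs the visNetwork package)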
Here are a couple of other ways...
in base R...
values <- unique(df$var1[duplicated(df$var1)])
do.call(rbind,
lapply(values, function(i) {
nodes <- as.character(df$cat1[df$var1 == i])
edges <- combn(nodes, 2)
data.frame(from = edges[1, ],
to = edges[2, ],
value = i,
stringsAsFactors = F)
})
)
in tidyverse...
library(dplyr)
library(tidyr)
df %>%
group_by(var1) %>%
filter(n() >= 2) %>%
mutate(cat1 = as.character(cat1)) %>%
summarise(edges = list(data.frame(t(combn(cat1, 2)), stringsAsFactors = F))) %>%
unnest(edges) %>%
select(from = X1, to = X2, value = var1)
in tidyverse using tidyr::complete...
library(dplyr)
library(tidyr)
df %>%
group_by(var1) %>%
mutate(cat1 = as.character(cat1)) %>%
mutate(i.cat1 = cat1) %>%
complete(cat1, i.cat1) %>%
filter(cat1 < i.cat1) %>%
select(from = cat1, to = i.cat1, value = var1)
in tidyverse using tidyr::expand...
library(dplyr)
library(tidyr)
df %>%
group_by(var1) %>%
mutate(cat1 = as.character(cat1)) %>%
expand(cat1, to = cat1) %>%
filter(cat1 < to) %>%
select(from = cat1, to, value = var1)