I am having an issue multiplying 3 columns by 3 different constants (i.e., 2, 3, and 4, respectively) and then summing each row after applying the conversion.
I am using dplyr
# NOTE(review): attempt from the question. `df` is supplied twice (once by the
# pipe, once as an explicit argument), and without rowwise() sum() adds up all
# values of the listed columns into a single number rather than one total per row.
variable <- df %>% transmute(df, sum(col1, col2*2, col3*3, col4*4))
We could do
library(dplyr)

# mutate() evaluates its arguments in order, so `total` is built from the
# already-rescaled a, b and c.
df %>%
  mutate(
    a = 2 * a,
    b = 3 * b,
    c = 4 * c,
    total = a + b + c
  )
# a b c total
#1 2 18 44 64
#2 4 21 48 73
#3 6 24 52 82
#4 8 27 56 91
#5 10 30 60 100
Using rowSums
df %>%
mutate(a = a * 2,
b = b * 3,
c = c * 4) %>%
mutate(total = rowSums(.))
Note that if we use rowSums, it has to go in a new mutate call rather than the same one; otherwise `.` still refers to the original df, so the unscaled columns would be summed instead of the changed ones.
Or in base R
# Rescale the three columns on a copy of df, then append the row totals.
df1 <- transform(df, a = 2 * a, b = 3 * b, c = 4 * c)
df1[["total"]] <- rowSums(df1)
data
df <- data.frame(a = 1:5, b = 6:10, c = 11:15)
In base R, we can do this more compactly with %*%
df$total <- c(as.matrix(df) %*% 2:4)
df
# a b c total
#1 1 6 11 64
#2 2 7 12 73
#3 3 8 13 82
#4 4 9 14 91
#5 5 10 15 100
Or with crossprod
df$total <- c(crossprod(t(df), 2:4))
--
Or with tidyverse
library(tidyverse)
map2(df, 2:4, ~ .x * .y) %>%
reduce(`+`) %>%
bind_cols(df, total = .)
data
df <- data.frame(a = 1:5, b = 6:10, c = 11:15)
# rowwise() makes sum() work one row at a time; ungroup() afterwards drops the
# row-wise grouping so later verbs operate on whole columns again.
variable <- df %>%
  rowwise() %>%
  mutate(new_var = sum(col1, col2*2, col3*3, col4*4)) %>%
  ungroup()
Try that instead.
add rowwise() to have data analyzed at each row
use mutate() to get the new calculation
Related
I have this sort of data:
df <- data.frame(
id = sample(1:5, 100, replace = TRUE),
dur = sample(c(NA, rnorm(10)), 100, replace = TRUE),
char = sample(LETTERS, 100, replace = TRUE)
)
From this I can compute counts and proportions of the variable char:
library(dplyr)
df %>%
filter(!is.na(dur) & id != lag(id)) %>%
count(char, name = 'freq', sort = TRUE) %>%
mutate(prop = prop.table(freq) * 100)
char freq prop
1 C 6 8.571429
2 M 6 8.571429
3 X 5 7.142857
4 Y 5 7.142857
5 Z 5 7.142857
6 E 4 5.714286
7 I 4 5.714286
8 K 4 5.714286
9 J 3 4.285714
10 Q 3 4.285714
... clipped
Now, in df, the char values also have duration values. So I want to add another column, say mean_dur, with the mean dur values grouped by char in df. Adding on something like group_by(char) etc. to the above code doesn't work as the variable char is no longer recognized. How can that be achieved?
EDIT:
It can be done in steps, like this:
# Step 1 -- make df with counts and proportions:
df1 <- df %>%
filter(!is.na(dur) & id != lag(id)) %>%
count(char, name = 'freq', sort = TRUE) %>%
mutate(prop = prop.table(freq) * 100)
# Step 2 -- make another df with mean dur values:
df2 <- df %>%
filter(!is.na(dur) & id != lag(id)) %>%
group_by(char) %>%
summarise(mean_dur = mean(dur, na.rm = TRUE))
# Step 3 -- transfer mean dur values by matching `char`in `df1`and `df2`
df1$mean_dur <- df2$mean_dur[match(df1$char, df2$char)]
But is there a cleaner and tidyer dplyr way?
EDIT 2:
Thanks to @Anoushiravan R's solution, from which I picked the left_join idea, this seems like a clean and tidy solution (and it does not require the package janitor):
df %>%
filter(!is.na(dur) & id != lag(id)) %>%
count(char, name = 'freq', sort = TRUE) %>%
mutate(prop = prop.table(freq) * 100) %>%
left_join(df %>%
filter(!is.na(dur) & id != lag(id)) %>%
group_by(char) %>%
summarise(mean_dur = mean(dur)), by = "char")
I hope this is what you are looking for:
library(dplyr)
library(janitor)
df %>%
filter(!is.na(dur) & !id == lag(id)) %>%
tabyl(char) %>%
rename(freq = percent) %>%
mutate(freq = freq * 100) %>%
select(-n) %>%
arrange(desc(freq)) %>%
left_join(df %>%
filter(!is.na(dur) & id != lag(id)) %>%
group_by(char) %>%
summarise(mean_dur = mean(dur)), by = "char")
char freq mean_dur
T 7.894737 -0.4861708
Z 7.894737 -0.2867046
A 6.578947 -0.5056797
B 5.263158 0.3513478
E 5.263158 0.5113139
K 5.263158 -1.4560764
L 5.263158 0.8235192
N 5.263158 0.9037481
X 5.263158 -1.4669529
C 3.947368 -0.4064762
I 3.947368 -0.7722133
P 3.947368 -0.1076928
U 3.947368 0.5573875
Y 3.947368 0.2404896
D 2.631579 0.5942473
F 2.631579 1.2381883
G 2.631579 -0.2155605
J 2.631579 1.0528329
M 2.631579 -1.5482806
O 2.631579 0.2813264
S 2.631579 1.2132490
V 2.631579 0.6157874
H 1.315789 -1.2664754
Q 1.315789 1.1027114
R 1.315789 0.1288634
W 1.315789 1.0528329
If you're prepared to give up prop.table, then I think this gives you what you want...
df %>%
filter(!is.na(dur) & id != lag(id)) %>%
group_by(char) %>%
summarise(
n=n(),
prop = 100*n/nrow(.),
mean_dur=mean(dur, na.rm=TRUE),
.groups="drop"
)
# A tibble: 25 x 4
char n prop mean_dur
* <fct> <int> <dbl> <dbl>
1 A 6 8.82 0.158
2 B 5 7.35 -0.144
3 C 2 2.94 0.951
4 D 2 2.94 0.518
5 E 5 7.35 0.211
6 F 3 4.41 0.333
7 G 2 2.94 0.951
8 H 3 4.41 0.624
9 I 2 2.94 -0.422
10 J 2 2.94 -0.347
# … with 15 more rows
[It took me a while to notice you were working with random data. set.seed() would have been helpful! ;=) ]
Edited in line with comment below
Another option:
# Mean duration per char, computed over all rows of df (NA durations ignored).
mean_dur <- df %>%
  group_by(char) %>%
  summarise(mean_dur = mean(dur, na.rm = TRUE))
# Counts and percentage shares of char among the filtered rows.
tab <- df %>%
  filter(!is.na(dur) & id != lag(id)) %>%
  count(char, name = 'freq') %>%
  mutate(prop = prop.table(freq) * 100)
# Attach the means via the shared `char` column, then sort by count,
# largest first.
tab <- merge(tab, mean_dur, by = "char")
tab <- tab[order(tab$freq, decreasing = TRUE), ]
char freq prop mean_dur
17 R 6 8.108108 -0.75610907
3 D 5 6.756757 -0.61657511
5 F 5 6.756757 -0.34153689
10 K 5 6.756757 -0.90688768
19 T 5 6.756757 0.33628707
6 G 4 5.405405 -0.93390134
9 J 4 5.405405 0.27471673
11 L 4 5.405405 0.87029782
13 N 4 5.405405 0.17163797
16 Q 4 5.405405 -0.67554378
22 X 4 5.405405 -0.42108346
7 H 3 4.054054 0.36290234
14 O 3 4.054054 -0.56712470
15 P 3 4.054054 0.08316665
2 C 2 2.702703 -1.15398142
4 E 2 2.702703 -0.31271923
12 M 2 2.702703 -0.96001502
18 S 2 2.702703 -0.88921047
20 U 2 2.702703 0.24299241
21 W 2 2.702703 -1.32772406
1 A 1 1.351351 0.24299241
8 I 1 1.351351 -1.07336407
23 Z 1 1.351351 -1.07336407
I have the following example data.
data_1 <- data.frame("ID" = c('a','b','c','d','e'),
"value" = c(2,4,9,5,3))
data_2 <- data.frame("ID" = c('a','c','d','b','e','a','e','d','c'),
'var' =c(2,6,2,4,6,8,6,4,5))
I want to calculate new column in data_2 such that for the same ID in the two dataset, the value and var is multiplied.
Something like for data_1$ID==data_2$ID then data_1$value*data_2$var. So newVar would be (4,54,10,16,18,16,18,20,45).
Join the two dataframes and multiply value and var.
transform(merge(data_1, data_2, by = 'ID'), result = value * var)
You can also use match :
transform(data_2, result = var * data_1$value[match(ID, data_1$ID)])
# ID var result
#1 a 2 4
#2 c 6 54
#3 d 2 10
#4 b 4 16
#5 e 6 18
#6 a 8 16
#7 e 6 18
#8 d 4 20
#9 c 5 45
Using dplyr :
library(dplyr)
inner_join(data_1, data_2, by = 'ID') %>% mutate(result = value * var)
Using data.table
library(data.table)
setDT(data_1)
setDT(data_2)
# Update data_2 by reference: every row of data_2 whose ID matches a row of
# data_1 gets result = its own var times the matching value from data_1
# (the i. prefix refers to columns of the table given in the join position).
# Joining in this direction keeps all rows of data_2, including repeated IDs.
data_2[data_1, result := var * i.value, on = .(ID)]
Here is a problem that I can't solve:
Data:
df <- data.frame(f1=c("a", "a", "b", "b", "c", "c", "c"),
v1=c(10, 11, 4, 5, 0, 1, 2))
data.frame:f1 is factor
f1 v1
a 10
a 11
b 4
b 5
c 0
c 1
c 2
# What I want is (for example): keep only the levels that occur exactly 2 times, then put their values into a data.frame with one column per level:
a b
10 4
11 5
Thanks in advance!
I might be missing something simple here , but the below approach using dplyr works.
library(dplyr)
library(tidyr)  # spread() lives in tidyr, not dplyr

# Number of observations a level of f1 must have to be kept.
nlevels <- 2

df1 <- df %>%
  add_count(f1) %>%              # n = number of rows sharing this f1
  filter(n == nlevels) %>%
  select(-n) %>%
  mutate(rn = row_number()) %>%  # unique row id so spread() has a key per row
  spread(f1, v1) %>%
  select(-rn)
This gives
# a b
# <int> <int>
#1 10 NA
#2 11 NA
#3 NA 4
#4 NA 5
Now, if you want to remove NA's we can do
do.call("cbind.data.frame", lapply(df1, function(x) x[!is.na(x)]))
# a b
#1 10 4
#2 11 5
As we have filtered the dataframe which has only nlevels observations, we would have same number of rows for each column in the final dataframe.
split might be useful here to split df$v1 into parts corresponding to df$f1. Since you are always extracting equal length chunks, it can then simply be combined back to a data.frame:
spl <- split(df$v1, df$f1)
data.frame(spl[lengths(spl)==2])
# a b
#1 10 4
#2 11 5
Or do it all in one call by combining this with Filter:
data.frame(Filter(function(x) length(x)==2, split(df$v1, df$f1)))
# a b
#1 10 4
#2 11 5
Here is a solution using unstack :
unstack(
droplevels(df[ave(df$v1, df$f1, FUN = function(x) length(x) == 2)==1,]),
v1 ~ f1)
# a b
# 1 10 4
# 2 11 5
A variant, similar to @thelatemail's solution:
data.frame(Filter(function(x) length(x) == 2, unstack(df,v1 ~ f1)))
My tidyverse solution would be:
library(tidyverse)
df %>%
group_by(f1) %>%
filter(n() == 2) %>%
mutate(i = row_number()) %>%
spread(f1, v1) %>%
select(-i)
# # A tibble: 2 x 2
# a b
# * <dbl> <dbl>
# 1 10 4
# 2 11 5
or mixing approaches :
as_tibble(keep(unstack(df,v1 ~ f1), ~length(.x) == 2))
Using all base functions (but you should use tidyverse)
# Work on a copy of the question's data so that df itself is left untouched
x <- df
# Add count of instances
x$len <- ave(x$v1, x$f1, FUN = length)
# Filter, drop the count
x <- x[x$len == 2, c("f1", "v1")]
# Hacky pivot: one column of v1 values per remaining level of f1.
# as.character() so the column names are the level labels even if f1 is a factor.
keys <- as.character(unique(x$f1))
result <- data.frame(
  lapply(keys, FUN = function(y) x$v1[x$f1 == y])
)
colnames(result) <- keys
> result
a b
1 10 4
2 11 5
I would code it like this; maybe it helps you:
library(reshape2)
library(dplyr)
aa <- data.frame(v1 = c('a','a','b','b','c','c','c'), f1 = c(10,11,4,5,0,1,2))
# Count the rows of each level of v1
cc <- aa %>% group_by(v1) %>% summarise(id = n())
dd <- merge(aa, cc) # attach each level's count (column id) to its rows
# Keep the levels that occur exactly twice; the count is stored in column id
ee <- dd[dd$id == 2, ]
ee$id <- rep(c(1, 2), nrow(ee) / 2) # reset index like (1,2,1,2)
dcast(ee, id ~ v1, value.var = 'f1')
all done!
I am having problems transforming my data.
I have a dataframe which tells which transitions were made, and how many times each sequence of transitions has occurred. The different columns correspond to the situation in periods 10, 11 and 12 (and there are more in my data). I want to summarize this data, and want to know how many times people went from A to C, A to D, but also C to G, etc. So basically I want to aggregate this data over each column paired with the column that follows it. My ultimate goal is to turn this data into a sankey diagram.
To illustrate:
df<-data.frame(s10=unlist(strsplit("AAAABBBBBC","")),
s11=unlist(strsplit("CCDDEEFFFF","")),
s12=unlist(strsplit("GHIGJKMNNN","")),
freq=c(10,20,30,40,50,60,70, 40, 20, 20))
s10 s11 s12 freq
1 A C G 10
2 A C H 20
3 A D I 30
4 A D G 40
5 B E J 50
6 B E K 60
7 B F M 70
8 B F N 40
9 B F N 20
10 C F N 20
And I aim to get this result:
colA colB freq
1 A C 30
2 A D 70
3 B E 110
4 B F 130
5 C F 20
6 C G 10
7 C H 20
8 D G 40
9 D I 30
10 E J 50
11 E K 60
12 F M 70
13 F N 80
I got this result by first aggregating the sum of frequencies for s10 and s11, and for s11 and s12, and then changing the column names and binding them together. It works, but I intend to do this with more columns, so I'm sure there is a more efficient way to do this. See the code I used below:
# Sum freq over every observed (s10, s11) transition.
bl1 <- df %>%
group_by(s10, s11) %>%
summarise(freq = sum(freq)) %>%
as.data.frame()
# Sum freq over every observed (s11, s12) transition.
bl2 <- df %>%
group_by(s11, s12) %>%
summarise(freq = sum(freq)) %>%
as.data.frame()
# Give both tables the same column names so they can be stacked with rbind().
colnames(bl1) <- c('colA', 'colB','freq' )
colnames(bl2) <- c('colA', 'colB','freq' )
rbind(bl1, bl2)
Any help is much appreciated!
You can rbind the selected columns of the data.frames together and then use aggregate. The only trick is to rename the columns so that they match. For this, I use setNames.
aggregate(freq ~ colA + colB,
data=rbind(setNames(df[c("s10", "s11", "freq")], c("colA", "colB", "freq")),
setNames(df[c("s11", "s12", "freq")], c("colA", "colB", "freq"))),
FUN=sum)
this returns the desired result.
colA colB freq
1 A C 30
2 A D 70
3 B E 110
4 B F 130
5 C F 20
6 C G 10
7 D G 40
8 C H 20
9 D I 30
10 E J 50
11 E K 60
12 F M 70
13 F N 80
OK. I gave it a go and had some fun with benchmarking. An alternate approach (that I used) was to use aggregate() itself. See fun1 for the implementation. I have made it to fit this particular example, and of course it will need tweaking to work with data frames of other widths.
Edit: I have removed dataframe creation from functions and added Benchmarking output1
# library() stops with an error if a package is missing; require() would only
# return FALSE and let the script fail later at the first use.
library(dplyr)
library(microbenchmark)
df<-data.frame(s10=unlist(strsplit("AAAABBBBBC","")),
s11=unlist(strsplit("CCDDEEFFFF","")),
s12=unlist(strsplit("GHIGJKMNNN","")),
freq=c(10,20,30,40,50,60,70, 40, 20, 20))
# Baseline from the question: aggregate freq for each pair of neighbouring
# state columns with dplyr, rename to common headers and stack the two tables.
# Reads `df` from the enclosing environment; returns a data.frame with the
# columns colA, colB and freq.
fun0 <- function() {
  headers <- c('colA', 'colB','freq' )

  first_step <- as.data.frame(
    summarise(group_by(df, s10, s11), freq = sum(freq))
  )
  second_step <- as.data.frame(
    summarise(group_by(df, s11, s12), freq = sum(freq))
  )

  colnames(first_step) <- headers
  colnames(second_step) <- headers
  rbind(first_step, second_step)
}
# Alternative to fun0 built on aggregate(): label every transition by pasting
# the two neighbouring state columns together (e.g. "AC"), stack the labels of
# both column pairs with their frequencies, and sum the frequencies per label.
# Reads `df` from the enclosing environment; columns 1 to 3 are used as the
# state columns and `freq` as the weight.
# Returns a data.frame with columns x (transition label) and f (summed freq).
fun1 <- function() {
  # paste0() is vectorised over whole columns, so no row-wise apply() (and no
  # coercion of the data frame to a matrix) is needed.
  a <- paste0(df[[1]], df[[2]])
  b <- paste0(df[[2]], df[[3]])
  z <- data.frame(x = c(a, b), f = rep(df$freq, 2))
  aggregate(f ~ x, data = z, FUN = sum)
}
fun0()
fun1()
#benchmarking
MB_res <- microbenchmark( fun0=fun0(), fun1=fun1() , times=1000)
MB_res
Results were:
Unit: milliseconds
expr min lq mean median uq max neval
fun0 2.218889 2.587820 2.773454 2.676921 2.785586 6.020277 1000
fun1 1.472971 1.737751 1.908966 1.842152 1.910118 8.915407 1000
Say that I have this data.frame, data:
data <- data.frame(val=c(rep(6,10), rep(7, 15), rep(8, 20), rep(9, 25), rep(10, 100), rep(11, 20), rep(12, 15), rep(13, 10)))
data$plus <- data$val + 100
My goal is to create a new data.frame that has the frequencies of each val, and the associated plus value.
My current strategy is to create a table (called table), then merge the frequencies. Then to keep only the first observation within each group:
table <- table(data$val)
df1 <- data.frame(val = as.integer(names(table)[1:length(table)]), N = table[1:length(table)])
df2 <- merge(data, df1)
df3 <- do.call(rbind, by(df2, list(df2$val), FUN=function(x) head(x, 1)))
This works, but it seems clunky.
In Stata, for example, it would be less and simpler code. Something like:
bys val plus: egen max = _N
bys val plus: gen first = _n==1
keep if first==1
Is there a way to simplify or make more elegant the R code?
Here's an approach using "data.table":
library(data.table)
as.data.table(data)[, N := .N, by = val][, .SD[1], by = val]
# val plus N
# 1: 6 106 10
# 2: 7 107 15
# 3: 8 108 20
# 4: 9 109 25
# 5: 10 110 100
# 6: 11 111 20
# 7: 12 112 15
# 8: 13 113 10
## Or (@RicardoSaporta)
as.data.table(data)[, list(.N, plus=plus[1]), by = val]
## Or (@DavidArenburg)
unique(as.data.table(data)[, N := .N, by = val], by = "val")
With "dplyr", you can try:
library(dplyr)
data %>%
  group_by(val) %>%
  mutate(N = n()) %>%  # group size, repeated on every row of the group
  slice(1) %>%         # keep the first row of each group
  ungroup()            # drop the grouping so later verbs see the whole table
In base R, I guess you can try something like:
do.call(rbind, lapply(split(data, data$val),
function(x) cbind(x, N = nrow(x))[1, ]))
Edited
Or you can use aggregate()
# Count rows per (val, plus) pair. The placeholder column N only gives
# aggregate() something to apply length() to; it is added to a temporary copy
# so that `data` itself is not modified.
out <- aggregate(N ~ val + plus, data = transform(data, N = 0), FUN = length)
or else
out = aggregate(plus ~val, data = data,function(x) c(unique(x), N = length(x)))
do.call(data.frame, out)
or using ddply
library(plyr)
out = ddply(data, .(val,plus), summarize, N = length(plus))
#> out
# val plus N
#1 6 106 10
#2 7 107 15
#3 8 108 20
#4 9 109 25
#5 10 110 100
#6 11 111 20
#7 12 112 15
#8 13 113 10