r count values in rows after dcast - r

I want to sum all values in each row of a data frame after performing a dcast operation from the reshape2 package. The problem is that every row gets the same total (10), which is the sum over all rows combined. The totals should be 4, 2 and 4.
Example data with code:
# Example data: 10 observations with a group `x`, a sub-category `y` and a label `z`.
df <- data.frame(x = as.factor(c("A","A","A","A","B","B","C","C","C","C")),
y = as.factor(c("AA","AB","AA","AC","BB","BA","CC","CC","CC","CD")),
z = c("var1","var1","var2","var1","var2","var1","var1","var2","var2","var1"))
# Count how many rows fall into each (x, y) combination.
df2 <- df %>%
group_by(x,y) %>%
summarise(num = n()) %>%
ungroup()
# Reshape to wide format: one row per x, one column per level of y,
# with 0 for combinations that never occur.
df3 <- dcast(df2,x~y, fill = 0 )
# Problem line: sum() adds up every value passed to it and returns a single
# number (10 here), which is then repeated in every row of `total`.
df3$total <- sum(df3$AA,df3$AB,df3$AC,df3$BA,df3$BB,df3$CC,df3$CD)

sum gives you 1 combined value and that value is repeated for all other rows.
sum(df3$AA,df3$AB,df3$AC,df3$BA,df3$BB,df3$CC,df3$CD)
#[1] 10
You need rowSums to get sum of each row separately.
df3$total <- rowSums(df3[-1])
Here is a simplified tidyverse approach starting from df :
library(dplyr)
library(tidyr)
# Count rows per (x, y), spread the levels of y into columns and add a row total.
df %>%
  count(x, y, name = 'num') %>%
  pivot_wider(names_from = y, values_from = num, values_fill = 0) %>%
  # Sum every count column, i.e. everything except the id column `x`, so the
  # total does not depend on which levels `y` happens to contain.
  mutate(total = rowSums(select(., -x)))
# x AA AB AC BA BB CC CD total
# <fct> <int> <int> <int> <int> <int> <int> <int> <dbl>
#1 A 2 1 1 0 0 0 0 4
#2 B 0 0 0 1 1 0 0 2
#3 C 0 0 0 0 0 3 1 4

We can specify the values_fn in pivot_wider and also use adorn_totals from janitor
library(dplyr)
library(tidyr)
library(janitor)
df %>%
pivot_wider(names_from = y, values_from = z, values_fill = 0,
values_fn = length) %>%
adorn_totals("col")
-output
# x AA AB AC BB BA CC CD Total
# A 2 1 1 0 0 0 0 4
# B 0 0 0 1 1 0 0 2
# C 0 0 0 0 0 3 1 4
Or using base R with xtabs and addmargins
# Weight every row by 1 so that xtabs() adds the weights up to a count per
# (x, y) cell; margin = 2 then appends a "Sum" column with each row's total.
addmargins(
  xtabs(z ~ x + y, data = transform(df, z = 1)),
  margin = 2
)
# y
#x AA AB AC BA BB CC CD Sum
# A 2 1 1 0 0 0 0 4
# B 0 0 0 1 1 0 0 2
# C 0 0 0 0 0 3 1 4

Related

Construct a variable name using a string and another variable in the same df and then update its value in R

I am trying to update variables in a df selected using a constructed name with a string and another variable in the same data frame.
Say my df is:
df
y index
1 4 1
2 8 5
3 4 3
4 6 2
to which I add five variables m.1 through m.5:
> df
y index m.1 m.2 m.3 m.4 m.5
1 4 1 0 0 0 0 0
2 8 5 0 0 0 0 0
3 4 3 0 0 0 0 0
4 6 2 0 0 0 0 0
using something like this:
# Add a column named "m.<n>" filled with 0 to `df` and return the result.
createvars <- function(df, n) {
  mutate(df, "m.{n}" := 0)
}
# seq_len() yields an empty sequence when max(index) is 0, whereas
# 1:max(index) would count down (1, 0) and create unwanted columns.
for (i in seq_len(max(df$index))) {
  df <- createvars(df, n = i)
}
I want to update the variables m.1 through m.5 using something like this m.{n} := index based on conditions that look something like this {n} == index.
The result then should look like:
> df
y index m.1 m.2 m.3 m.4 m.5
1 4 1 1 0 0 0 0
2 8 5 0 0 0 0 5
3 4 3 0 0 3 0 0
4 6 2 0 2 0 0 0
Note, in the actual sample/application:
the number of variables m. will depend on the sample and can be several hundred,
the values in the variables m. will be functions of index and other variables in the df,
the df will ultimately be used in lm(y~) and this might not be the right way to proceed.
Any suggestion how to accomplish this?
Thanks tons for any suggestion!!!!
One dplyr and tibble solution could be:
# Names of the indicator columns: one per possible value of `index`
# (m1 ... m<max index>) rather than a fixed five.
m_cols <- paste0("m", seq_len(max(df$index)))
df %>%
  # Start every indicator column at 0.
  add_column(!!!setNames(rep(0, length(m_cols)), m_cols)) %>%
  # A cell keeps the row's `index` when its column name equals m<index> and
  # stays 0 otherwise; the unary `+` turns the logical comparison into 0/1.
  # all_of() restricts the update to the columns created above.
  mutate(across(
    all_of(m_cols),
    ~ +(paste0("m", index) == cur_column()) * index
  ))
y index m1 m2 m3 m4 m5
1 4 1 1 0 0 0 0
2 8 5 0 0 0 0 5
3 4 3 0 0 3 0 0
4 6 2 0 2 0 0 0
It is easier with row/column indexing in base R
# // create one 'm.' column per possible index value, filled with 0
nm1 <- paste0('m.', seq_len(max(df$index)))
df[nm1] <- 0
# // matrix indexing: each (row number, index) pair addresses one cell of the
# // 'm.' columns, and that cell receives the row's index value
df[nm1][cbind(seq_len(nrow(df)), df$index)] <- df$index
-output
df
# y index m.1 m.2 m.3 m.4 m.5
#1 4 1 1 0 0 0 0
#2 8 5 0 0 0 0 5
#3 4 3 0 0 3 0 0
#4 6 2 0 2 0 0 0
Or another option is model.matrix from base R
# One indicator column per factor level (the `- 1` drops the intercept). The
# levels follow the 'm.' columns in nm1 instead of a fixed 1:5; multiplying by
# index turns each indicator 1 into the row's index value.
df[nm1] <- model.matrix(~ factor(df$index, levels = seq_along(nm1)) - 1) * df$index
Or modify the function createvars and use a for loop
library(stringr)
# Add the column "m.<n>" to `data`: it holds `index` on the rows where index
# equals n and 0 on every other row.
createvars <- function(data, n) {
  mutate(
    data,
    "m.{n}" := case_when(index == n ~ index, TRUE ~ 0L)
  )
}
# Apply createvars() once for every value from 1 up to the largest index.
for (i in seq_len(max(df$index))) {
  df <- createvars(df, i)
}
Or another option with rowwise and unnest
library(tidyr)
library(dplyr)
mx <- max(df$index)
# Name the vector elements up front: unnest_wider() then uses these names as
# column names directly, so nothing depends on automatically generated names
# or on renaming them afterwards.
m_names <- paste0('m.', seq_len(mx))
df %>%
  rowwise() %>%
  # Per row: a zero vector of length mx with `index` stored at position `index`.
  mutate(new = list(setNames(replace(numeric(mx), index, index), m_names))) %>%
  ungroup() %>%
  unnest_wider(new)
data
# Example data: a response `y` and the `index` that picks the m.<index> column.
df <- data.frame(
  y = c(4L, 8L, 4L, 6L),
  index = c(1L, 5L, 3L, 2L),
  row.names = c("1", "2", "3", "4")
)
You can get the data in wide format after creating a dummy column.
library(dplyr)
library(tidyr)
# Spread `index` into indicator columns m1 ... m<max index>.
df %>%
# Helper columns: `row` remembers the original row order, `col` is the name of
# the target column (m<index>) and `index1` is a copy of index used as the
# cell value.
mutate(row = row_number(),
col = paste0('m', index),
index1 = index) %>%
# Add a placeholder row for every m1 ... m<max index> name that does not occur
# in the data, so that each of them still becomes a column below.
complete(col = paste0('m', 1:max(index))) %>%
# One column per `col` value; cells without a value are filled with 0.
pivot_wider(names_from = col, values_from = index1, values_fill = 0) %>%
# Remove the placeholder rows (they have no `row` id), restore the original
# order and drop the helper column.
filter(!is.na(row)) %>%
arrange(row) %>%
select(-row)
# y index m1 m2 m3 m4 m5
# <int> <int> <int> <int> <int> <int> <int>
#1 4 1 1 0 0 0 0
#2 8 5 0 0 0 0 5
#3 4 3 0 0 3 0 0
#4 6 2 0 2 0 0 0

from column with factors to two different column with 0, 1

I have a column in a data frame that contains the values group1 and group2.
# Example data: five observations, each labelled "group1" or "group2".
value <- 1:5
group <- c("group1", "group1", "group2", "group1", "group2")
dat <- data.frame(value = value, group = group)
I want to make it like this-
group1 <- c(1, 1, 0, 1, 0)
group2 <- c(0, 0, 1, 0, 1)
dat<- data.frame(value, group1, group2)
I tried this but have to remove the group column later
dat<- dat %>%
mutate( group1 = ifelse(data1$group =="group1", 1, 0 ),
group2 = ifelse(data1$group =="group2", 1, 0 ) )
Is there a nicer way to do this?
Thanks in advance for your help.
You could create a dummy column and get data in wide format.
library(dplyr)
library(tidyr)
# Tag every row with n = 1, then spread the group labels into columns: a row
# gets 1 in the column of its own group and the fill value 0 elsewhere.
result <- dat %>%
  mutate(n = 1) %>%
  pivot_wider(names_from = group, values_from = n, values_fill = 0)
# value group1 group2
# <int> <dbl> <dbl>
#1 1 1 0
#2 2 1 0
#3 3 0 1
#4 4 1 0
#5 5 0 1
Or in base R use table :
table(dat)
# group
#value group1 group2
# 1 1 0
# 2 1 0
# 3 0 1
# 4 1 0
# 5 0 1
A base R option using reshape
# Add a constant column q = 1, then reshape to wide format so that every group
# becomes its own column (q.<group>), with one row per `value`.
out <- reshape(
  cbind(dat, q = 1),
  direction = "wide",
  idvar = "value",
  timevar = "group"
)
# Combinations that do not occur come out as NA; turn them into 0.
out[is.na(out)] <- 0
out
giving
value q.group1 q.group2
1 1 1 0
2 2 1 0
3 3 0 1
4 4 1 0
5 5 0 1
We can use data.table
library(data.table)
dcast(setDT(dat), value ~ group, length)
# value group1 group2
#1: 1 1 0
#2: 2 1 0
#3: 3 0 1
#4: 4 1 0
#5: 5 0 1
Or this can be done with pivot_wider in a single step by specifying values_fn
library(dplyr)
library(tidyr)
dat %>%
pivot_wider(names_from = group, values_from = group,
values_fn = length, values_fill = 0)
# A tibble: 5 x 3
# value group1 group2
# <int> <int> <int>
#1 1 1 0
#2 2 1 0
#3 3 0 1
#4 4 1 0
#5 5 0 1
Insert %>% select(!"group") at the end of the dplyr pipe. Also remove data1$ from it - you probably meant dat, even that's not needed.
dat %>%
mutate(group1 = ifelse(group =="group1", 1, 0 ),
group2 = ifelse(group =="group2", 1, 0 )) %>%
select(!"group")
value group1 group2
1 1 1 0
2 2 1 0
3 3 0 1
4 4 1 0
5 5 0 1

Turn row values from multiple columns into column names in R?

I have a data frame that looks as follows:
state1 state1_pp state2 state2_pp state3 state3_pp
<chr> <chr> <chr> <chr> <chr> <chr>
1 0 0.995614 F 0.004386 NA 0
2 0 1 NA 0 NA 0
3 0 1 NA 0 NA 0
I want the values in the state columns of each row to become the column names, and the numeric values to become the row values:
0 F NA
<chr> <chr> <chr>
1 0.995614 0.004386 0
2 1 0 0
3 1 0 0
How do I do this in R?
Or a more complex scenario:
state1 state1_pp state2 state2_pp state3 state3_pp
1 0 0.995614 F 0.004386 NA 0
2 A 1 B 0 C 0
3 D 0.7 B 0.3 NA 0
This is what I want:
0 A D F B C NA
1 0.995614 0 0 0.004386 0 0 0
2 0 1 0 0 0 0 0
3 0 0 0.7 0 0.3 0 0
First a warning, having column names that are numeric (like 1) or are reserved R keywords (like NA) can cause you all sorts of errors. But if you must do it, I suggest the following:
library(dplyr)
# extract title row: the new column names are taken from the state columns of
# the first row only
headers <- df %>%
  head(1) %>%
  select(state1, state2, state3) %>%
  unlist(use.names = FALSE) %>%
  as.character()
# replace NA with "NA" so that every column gets a usable name
headers[is.na(headers)] <- "NA"
# drop columns that are not wanted (only the *_pp value columns remain)
new_df <- df %>%
  select(-state1, -state2, -state3)
# replace column names
colnames(new_df) <- headers
In order to refer to your new columns you will probably need to use backticks: `
So with your new column names 0, F and NA you can call df$F, but you cannot call df$NA or df$0. Instead you will have to call df$`0` and df$`NA`.
Here's an attempt using dplyr and tidyr :
library(dplyr)
library(tidyr)
# Turn the stateN / stateN_pp column pairs into one column per distinct state.
df %>%
# Tag each row with its position, then convert every column to character so
# that all values can be stacked into a single column.
mutate(row = row_number()) %>%
mutate_all(as.character) %>%
# Long format: one (name, value) pair per original cell.
pivot_longer(cols = -row) %>%
# Strip the digits from the names so they collapse to "state" / "state_pp".
mutate(name = sub('\\d+', '', name)) %>%
# Number the repeated names within each original row; this id keeps the pairs
# apart when they are widened again.
group_by(name, row) %>%
mutate(row1 = row_number()) %>%
# Back to two columns, `state` and `state_pp`, one row per pair.
pivot_wider() %>%
# Re-number within each (state, row), so a state that occurs more than once in
# the same row gets distinct ids.
group_by(state, row) %>%
mutate(row1 = row_number()) %>%
# One column per distinct state holding its state_pp value; 0 where absent.
pivot_wider(names_from = state, values_from = state_pp,
values_fill = list(state_pp = 0)) %>%
ungroup() %>%
# Drop the helper columns.
select(-row, -row1)
# A tibble: 3 x 7
# `0` F `NA` A B C D
# <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#1 0.995614 0.004386 0 0 0 0 0
#2 0 0 0 1 0 0 0
#3 0 0 0 0 0.3 0 0.7

R: Frequency count but every category in a separate column

With the following code I assign a quantile rank y (from 1 to 4) for every value of x.
df$y <- ntile(df$x, 4)
Then, I would like to have four separate columns for absolute frequency count of every quantile rank, grouped also by variable z. With the following code, it does the calculation but I get all calculations in the same column.
df <-
df %>%
group_by(z, y) %>%
mutate(Freq = n())
example:
z y(quartile) n_quartile_4 n_quartile_3 n_quartile_2
1 4 2 1 0
1 3 2 1 0
1 4 2 1 0
2 2 0 0 3
2 2 0 0 3
2 2 0 0 3
We could create the count column with add_count, then pivot to 'wide' format with pivot_wider, fill the NA elements with the non-NA value in the column for each group and finally replace the rest of the NAs with 0
library(dplyr)
library(tidyr)
library(stringr)
# Count the rows in every (z, y) combination, give each quartile rank its own
# n_quartile_<y> column and repeat the counts on every row of the same z.
df %>%
  add_count(z, y) %>%
  # `rn` gives every row its own id, so rows sharing the same (z, y) stay
  # separate when the data is widened.
  mutate(new = str_c('n_quartile_', y), rn = row_number()) %>%
  pivot_wider(names_from = new, values_from = n) %>%
  group_by(z) %>%
  # Copy each count to the other rows of the same z group.
  fill(starts_with('n_quartile'), .direction = 'downup') %>%
  ungroup() %>%
  select(-rn) %>%
  # Quartile ranks that never occur within a group are still NA: set them to 0.
  mutate(across(starts_with('n_quartile'), ~ replace_na(.x, 0)))
# A tibble: 6 x 5
# z y n_quartile_4 n_quartile_3 n_quartile_2
# <int> <dbl> <dbl> <dbl> <dbl>
#1 1 4 2 1 0
#2 1 3 2 1 0
#3 1 4 2 1 0
#4 2 2 0 0 3
#5 2 2 0 0 3
#6 2 2 0 0 3
data
# Example data: group id `z` and quartile rank `y`.
df <- data.frame(
  z = rep(1:2, each = 3),
  y = c(4, 3, 4, 2, 2, 2)
)

Count combinations between columns in data frame

I have a dataframe like this
V1 V2 V3
1 A A A
2 B A A
3 A B C
4 C A A
With this code I get another data frame with all the possible combinations of "A", "B" and "C".
library("gtools")
vars <- c("A", "B", "C")
# All unordered pairs of `vars`, repeats included (AA, AB, ..., CC); n follows
# the length of `vars` instead of being fixed at 3.
combMatrix <- combinations(
  n = length(vars),
  r = 2,
  v = vars,
  repeats.allowed = TRUE
)
combArray <- paste0(combMatrix[, 1], combMatrix[, 2])
# Cross every pair with every single value of `vars`.
combDf <- expand.grid(combArray, vars)
Then I want to count the combinations between a pair of columns in the first data frame (let's say V1 and V2) and the remaining column. It is important to treat concatenations of V1 and V2 such as "AB" and "BA" as the same combination.
The final data frame should look like this.
V1+V2 V3 Freq
AA A 1
AB A 1
AC A 1
BB A 0
BC A 0
CC A 0
AA B 0
AB B 0
AC B 0
BB B 0
BC B 0
CC B 0
AA C 0
AB C 1
AC C 0
BB C 0
BC C 0
CC C 0
Then I have to iterate the process for every combinations of columns (V1+V2/V3, V1+V3/V2, V2+V3/V1).
You can try this tidyverse solution.
First calculate all possible 18 combinations considering AB==BA for two variables using some magic and not-so-elegant code including map, unite, sort and paste together with rowwise.
library(tidyverse)
# Every distinct value found anywhere in `d`, computed once.
vals <- as.character(unique(unlist(d)))
# All keys of the form "<sorted pair>_<third value>", e.g. "AB_C".
all_combs <- expand.grid(vals, vals, vals, stringsAsFactors = FALSE) %>%
  rowwise() %>%
  # Sort the first two values so that e.g. "BA" and "AB" both become "AB".
  mutate(two = paste(sort(c(Var1, Var2)), collapse = "")) %>%
  ungroup() %>%
  unite(all, two, Var3) %>%
  select(all) %>%
  # Sorting makes several grid rows collapse to the same key; keep each once.
  distinct()
Then the rest
# For every pair of columns of `d`, count how often each
# "<sorted pair>_<remaining column>" key occurs, then put the counts of all
# column pairs side by side.
combn(seq_len(ncol(d)), 2, simplify = FALSE) %>%
  # Name each column pair "i&j"; these names become the output columns.
  set_names(map(., ~paste(., collapse = "&"))) %>%
  # Rename the two chosen columns to a and b and keep the remaining ones.
  map(~select(d, a = .[1], b = .[2], everything()) %>%
    rowwise() %>%
    mutate_all(as.character) %>%
    # Sort a and b so that "AB" and "BA" count as the same combination.
    mutate(two = paste(sort(c(a, b)), collapse = "")) %>%
    select(two, contains("V"), -a, -b) %>%
    ungroup() %>%
    # Key: sorted pair, "_", value of the remaining column.
    unite(all, two, contains("V")) %>%
    count(all)) %>%
  # Bring in every possible key so combinations that never occur are listed.
  map(~right_join(., all_combs, by = "all")) %>%
  bind_rows(.id = "id") %>%
  # Keys without a match have no count: report them as 0.
  mutate(n = ifelse(is.na(n), 0, n)) %>%
  # One column of counts per column pair.
  spread(id, n)
# A tibble: 18 x 4
all `1&2` `1&3` `2&3`
<chr> <dbl> <dbl> <dbl>
1 AA_A 1 1 1
2 AA_B 0 0 1
3 AA_C 0 0 1
4 AB_A 1 1 0
5 AB_B 0 0 0
6 AB_C 1 0 0
7 AC_A 1 1 0
8 AC_B 0 1 0
9 AC_C 0 0 0
10 BB_A 0 0 0
11 BB_B 0 0 0
12 BB_C 0 0 0
13 BC_A 0 0 1
14 BC_B 0 0 0
15 BC_C 0 0 0
16 CC_A 0 0 0
17 CC_B 0 0 0
18 CC_C 0 0 0

Resources