I have a numeric data frame, and I'm trying to change each value in it to a string and then summarize it, proportionally, by that string.
Example table:
var1 var2 var3 var4
2 3 5 8
3 6 8 7
5 3 7 4
If a number is less than 4, I would like to replace it with "no", and if a number is 4 or greater, I would like to replace it with "yes".
I then would like to summarize the proportion of "yes" and "no" values for each column.
I've tried using dplyr, and it seems easy to do for a single column, but I'm having trouble applying it across all columns. I tried an approach like this, but didn't have success.
df %>%
select(var1:var4) %>%
mutate_each(funs(replace(., . < 4, "no")) %>%
mutate_each(funs(replace., . => 4, "yes")) %>%
summarise(count = n())
Thanks for any help.
library(tidyverse)
df <- data.frame(
var1 = c(2L, 3L, 5L),
var2 = c(3L, 6L, 3L),
var3 = c(5L, 8L, 7L),
var4 = c(8L, 7L, 4L)
)
df %>%
mutate(across(.fns = ~ . >= 4)) %>%
summarise(across(.fns = ~ sum(.)/length(.) ))
#> var1 var2 var3 var4
#> 1 0.3333333 0.3333333 1 1
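Since R stores logicals as 0/1, the same proportions also fall out of base R in one line; a minimal sketch, assuming the df defined above:
# colMeans treats TRUE as 1, so this is the proportion of values >= 4 ("yes") per column
colMeans(df >= 4)
#>      var1      var2      var3      var4
#> 0.3333333 0.3333333 1.0000000 1.0000000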
We can loop across the columns and convert the values to 'no'/'yes', reshape to 'long' format with pivot_longer, get the count, and divide by the sum of the count after grouping by 'name' to return the proportions.
library(dplyr)
library(tidyr)
df %>%
mutate(across(everything(),
~ case_when(. < 4 ~ 'no', TRUE ~ 'yes'))) %>%
pivot_longer(everything()) %>%
count(name, value) %>%
group_by(name) %>%
mutate(prop = 100 * n/sum(n)) %>%
ungroup
-output
# A tibble: 6 x 4
name value n prop
<chr> <chr> <int> <dbl>
1 var1 no 2 66.7
2 var1 yes 1 33.3
3 var2 no 2 66.7
4 var2 yes 1 33.3
5 var3 yes 3 100
6 var4 yes 3 100
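If a wide layout (one row per variable) is preferred instead, the long summary can be pivoted back out; a sketch reusing the pipe above:
df %>%
  mutate(across(everything(),
    ~ case_when(. < 4 ~ 'no', TRUE ~ 'yes'))) %>%
  pivot_longer(everything()) %>%
  count(name, value) %>%
  group_by(name) %>%
  mutate(prop = 100 * n/sum(n)) %>%
  ungroup() %>%
  # drop the raw counts, then spread the labels back out;
  # values_fill puts 0 in the "no" column for the all-"yes" variables
  select(-n) %>%
  pivot_wider(names_from = value, values_from = prop, values_fill = 0)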
Or using base R
100 * proportions(table(names(df)[col(df)], c('no', 'yes')[(df >= 4) + 1]), 1)
no yes
var1 66.66667 33.33333
var2 66.66667 33.33333
var3 0.00000 100.00000
var4 0.00000 100.00000
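The one-liner is dense, so here is a step-by-step sketch of the same computation:
labs <- c('no', 'yes')[(df >= 4) + 1]  # label for every cell: FALSE + 1 -> 'no', TRUE + 1 -> 'yes'
cols <- names(df)[col(df)]             # the column name of every cell
tab <- table(cols, labs)               # counts of 'no'/'yes' per column
100 * proportions(tab, margin = 1)     # row-wise percentages (proportions() needs R >= 4.0.1; older versions can use prop.table())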
data
df <- structure(list(var1 = c(2L, 3L, 5L), var2 = c(3L, 6L, 3L), var3 = c(5L,
8L, 7L), var4 = c(8L, 7L, 4L)), class = "data.frame", row.names = c(NA,
-3L))
Here is an answer similar to akrun's (the original), using ifelse with group_by and summarise:
df %>%
mutate(across(var1:var4, ~ifelse(. < 4, 'no', 'yes'))) %>%
pivot_longer(
cols = starts_with("var")) %>%
group_by(name, value) %>%
summarise(count = n()) %>%
mutate(prop = count/sum(count)*100)
Output:
name value count prop
<chr> <chr> <int> <dbl>
1 var1 no 2 66.7
2 var1 yes 1 33.3
3 var2 no 2 66.7
4 var2 yes 1 33.3
5 var3 yes 3 100
6 var4 yes 3 100
I have a complicated data frame that I'm trying to reshape.
Here is an example of the type of data frame that I have:
names <- c("var1", 'var2', "split")
values <- rnorm(8)
from <- data.frame(a = rep(1, 10),
b = c(rep(1,3), rep(2, 7)),
c = c(names, names, rep("split", 4)),
d = c(rep("NA", 5), names, rep("split", 2)),
e = c(rep("NA", 7), names),
f = c(values[1:2], "NA", values[3:8], "NA"))
And this produces something that looks like this:
> from
a b c d e f
1 1 1 var1 NA NA -0.271930473373158
2 1 1 var2 NA NA -0.0968100775823158
3 1 1 split NA NA NA
4 1 2 var1 NA NA -1.73919094720254
5 1 2 var2 NA NA -0.52398152119997
6 1 2 split var1 NA 0.856367467674763
7 1 2 split var2 NA -0.729762707907525
8 1 2 split split var1 0.561460771889416
9 1 2 split split var2 0.0432022687633195
10 1 2 split split split NA
Inside my data frame from, I want to turn var1 and var2 into columns, and then use the values from column f as the values that correspond to var1 and var2 (reading row-wise).
In other words, I am trying to reshape this data frame into something that looks like this:
> out
a b var1 var2
1 1 1 -0.2719305 -0.09681008
2 1 2 -1.7391909 -0.52398152
3 1 2 0.8563675 -0.72976271
4 1 2 0.5614608 0.04320227
Any suggestions as to how I could do this?
We can reshape to 'long' with pivot_longer, remove the NA elements, keep only the 'var' elements with filter, and then reshape back to 'wide' with pivot_wider.
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)
from %>%
type.convert(as.is = TRUE) %>%
pivot_longer(cols = c:e, values_drop_na = TRUE) %>%
filter(str_detect(value, 'var')) %>%
select(-name) %>%
mutate(rn = rowid(a, b, value)) %>%
pivot_wider(names_from = value, values_from = f) %>%
select(-rn)
-output
# A tibble: 4 × 4
a b var1 var2
<int> <int> <dbl> <dbl>
1 1 1 -0.272 -0.0968
2 1 2 -1.74 -0.524
3 1 2 0.856 -0.730
4 1 2 0.561 0.0432
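The rowid() step is what keeps pivot_wider from collapsing the repeated (a, b) combinations into list-columns: it numbers the duplicates within each (a, b, value) group. A quick illustration of rowid() from data.table:
library(data.table)
rowid(c(1, 1, 2, 2, 2))  # sequence within each group of identical values
#> [1] 1 2 1 2 3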
data
from <- structure(list(a = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
b = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), c = c("var1",
"var2", "split", "var1", "var2", "split", "split", "split",
"split", "split"), d = c("NA", "NA", "NA", "NA", "NA", "var1",
"var2", "split", "split", "split"), e = c("NA", "NA", "NA",
"NA", "NA", "NA", "NA", "var1", "var2", "split"), f = c("-0.271930473373158",
"-0.0968100775823158", "NA", "-1.73919094720254", "-0.52398152119997",
"0.856367467674763", "-0.729762707907525", "0.561460771889416",
"0.0432022687633195", "NA")), row.names = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10"), class = "data.frame")
Here is a solution with a single pivoting step:
library(dplyr)
library(tidyr)
library(stringr)
from %>%
type.convert(as.is = TRUE) %>%
filter(!is.na(f)) %>%
mutate(name = str_extract(paste(c, d, e), 'var(.)')) %>%
select(a, b, f, name) %>%
pivot_wider(
names_from = name,
values_from = f,
values_fn = list
) %>%
unnest(cols = c(var1, var2))
a b var1 var2
<int> <int> <dbl> <dbl>
1 1 1 -0.272 -0.0968
2 1 2 -1.74 -0.524
3 1 2 0.856 -0.730
4 1 2 0.561 0.0432
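The values_fn = list and unnest() pair is what absorbs the duplicated (a, b) keys: pivot_wider stores the repeats as list-columns and unnest() expands them back row-wise in parallel. A minimal illustration on hypothetical toy data:
library(dplyr)
library(tidyr)
toy <- data.frame(g = 1, k = c('x', 'y', 'x', 'y'), v = 1:4)
toy %>%
  pivot_wider(names_from = k, values_from = v, values_fn = list) %>%
  unnest(cols = c(x, y))
#> # A tibble: 2 x 3
#>       g     x     y
#>   <dbl> <int> <int>
#> 1     1     1     2
#> 2     1     3     4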
This can be achieved by coupling a series of logical operations to pick out the values in from$f:
data.frame( a=from$a[rowSums(from == "var1", na.rm=T) == 1],
b=from$b[rowSums(from == "var1", na.rm=T) == 1],
var1=from$f[rowSums(from == "var1", na.rm=T) == 1],
var2=from$f[rowSums(from == "var2", na.rm=T) == 1] )
a b var1 var2
1 1 1 -0.2719305 -0.09681008
2 1 2 -1.7391909 -0.52398152
3 1 2 0.8563675 -0.72976271
4 1 2 0.5614608 0.04320227
The idea is to build a row_number() identifier within each group:
library(dplyr)
library(tidyr)
library(purrr)
from %>%
type.convert(as.is = TRUE) %>%
filter(!is.na(f)) %>%
group_by(name = invoke(coalesce, across(c:e, na_if, 'split')))%>%
mutate(id = row_number()) %>%
pivot_wider(c(a, b, id), values_from = f) %>%
select(-id)
# A tibble: 4 x 4
a b var1 var2
<int> <int> <dbl> <dbl>
1 1 1 -0.272 -0.0968
2 1 2 -1.74 -0.524
3 1 2 0.856 -0.730
4 1 2 0.561 0.0432
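The grouping line works by first turning 'split' into NA with na_if() and then taking the first non-NA across c:e with coalesce(); a small illustration on hypothetical vectors:
library(dplyr)
x <- c('var1', 'split')
y <- c(NA, 'var2')
coalesce(na_if(x, 'split'), na_if(y, 'split'))
#> [1] "var1" "var2"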
I have a dataset with various "chunks" of columns with different prefixes, but the same suffix:
ID  A034  B034  C034  D034  A099  B099  A123  B123  ...
1   NA    1     NA    NA    NA    3     1     NA    ...
2   2     NA    NA    NA    2     NA    NA    2     ...
3   NA    NA    2     NA    NA    2     1     NA    ...
The number of columns within each "chunk" also varies. Is there any way (other than manually, which is what I have been painstakingly doing with coalesce(!!! select(., contains("XXX")))) to automatically coalesce by chunk based on the shared suffix? That is, the result should resemble
ID  034  099  123  ...
1   1    3    1    ...
2   2    2    2    ...
3   2    2    1    ...
I'm not sure how to begin doing something like this, so any suggestions would be very helpful.
We reshape the data into 'long' format with pivot_longer, then group by 'ID' and loop across the other columns, applying na.omit to remove the NA elements (we assume there is only one non-NA per column per group).
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(cols = -ID, names_to = ".value",
names_pattern = "[A-Z](\\d+)") %>%
group_by(ID) %>%
summarise(across(everything(), na.omit), .groups = 'drop')
-output
# A tibble: 3 x 4
ID `034` `099` `123`
<int> <int> <int> <int>
1 1 1 3 1
2 2 2 2 2
3 3 2 2 1
Or, to be safe, use complete.cases to create a logical vector of non-NA elements and extract the first one (assuming we need only a single non-NA; if the non-NA lengths differ, we may need to return a list instead, as sketched below).
df1 %>%
pivot_longer(cols = -ID, names_to = ".value",
names_pattern = "[A-Z](\\d+)") %>%
group_by(ID) %>%
summarise(across(everything(), ~ .[complete.cases(.)][1]))
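And if the non-NA lengths do differ, the same idea can return list-columns instead of single values; a sketch:
df1 %>%
  pivot_longer(cols = -ID, names_to = ".value",
    names_pattern = "[A-Z](\\d+)") %>%
  group_by(ID) %>%
  # wrap in list() to keep every non-NA value per cell, not just the first
  summarise(across(everything(), ~ list(.[complete.cases(.)])), .groups = 'drop')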
data
df1 <- structure(list(ID = 1:3, A034 = c(NA, 2L, NA), B034 = c(1L, NA,
NA), C034 = c(NA, NA, 2L), D034 = c(NA, NA, NA), A099 = c(NA,
2L, NA), B099 = c(3L, NA, 2L), A123 = c(1L, NA, 1L), B123 = c(NA,
2L, NA)), class = "data.frame", row.names = c(NA, -3L))
One more approach:
library(tidyverse)
split(names(df1)[-1], gsub('^\\D*(\\d+)$', '\\1', names(df1)[-1])) %>%
  map(~ df1[c('ID', .x)]) %>%
  imap(~ .x %>%
    group_by(ID) %>%
    rowwise() %>%
    transmute(!!.y := first(na.omit(c_across(everything())))) %>%
    ungroup()) %>%
  reduce(left_join, by = 'ID')
#> # A tibble: 3 x 4
#> ID `034` `099` `123`
#> <int> <int> <int> <int>
#> 1 1 1 3 1
#> 2 2 2 2 2
#> 3 3 2 2 1
Created on 2021-06-20 by the reprex package (v2.0.0)
I understand we can use the dplyr function coalesce() to unite different columns, but is there such a function to unite rows?
I am struggling with a confusingly incomplete/doubled data frame that has duplicate rows for the same id but different columns filled in. E.g.:
id sex age source
12 M NA 1
12 NA 3 1
13 NA 2 2
13 NA NA NA
13 F 2 NA
and I am trying to achieve:
id sex age source
12 M 3 1
13 F 2 2
You can try:
library(dplyr)
#Data
df <- structure(list(id = c(12L, 12L, 13L, 13L, 13L), sex = structure(c(2L,
NA, NA, NA, 1L), .Label = c("F", "M"), class = "factor"), age = c(NA,
3L, 2L, NA, 2L), source = c(1L, 1L, 2L, NA, NA)), class = "data.frame", row.names = c(NA,
-5L))
df %>%
group_by(id) %>%
fill(everything(), .direction = "down") %>%
fill(everything(), .direction = "up") %>%
slice(1)
# A tibble: 2 x 4
# Groups: id [2]
id sex age source
<int> <fct> <int> <int>
1 12 M 3 1
2 13 F 2 2
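With tidyr 1.0 or later, the two fill() calls can be collapsed into one using .direction = "downup"; a sketch of the same idea:
df %>%
  group_by(id) %>%
  fill(everything(), .direction = "downup") %>%
  slice(1) %>%
  ungroup()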
As mentioned by @A5C1D2H2I1M1N2O1R2T1, you can select the first non-NA value in each group. This can be done using dplyr:
library(dplyr)
df %>% group_by(id) %>% summarise(across(.fns = ~na.omit(.)[1]))
# A tibble: 2 x 4
# id sex age source
# <int> <fct> <int> <int>
#1 12 M 3 1
#2 13 F 2 2
Base R :
aggregate(.~id, df, function(x) na.omit(x)[1], na.action = 'na.pass')
Or data.table :
library(data.table)
setDT(df)[, lapply(.SD, function(x) na.omit(x)[1]), id]
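And since the question asks about coalesce() specifically: it combines columns, but it can also be folded down the rows of each group with purrr::reduce(); a sketch, assuming df as defined above:
library(dplyr)
library(purrr)
df %>%
  group_by(id) %>%
  # reduce(., coalesce) walks down each column and keeps the first non-NA value
  summarise(across(everything(), ~ reduce(., coalesce)))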
I'd like to find consecutive months by client. I thought this would be easy, but I still can't find a solution.
My goal is to count consecutive months of purchases for each client. Any help would be appreciated.
My data
Client Month consecutive
A 1 1
A 1 2
A 2 3
A 5 1
A 6 2
A 8 1
B 8 1
In base R, we can use ave:
df$consecutive <- with(df, ave(Month, Client, cumsum(c(TRUE, diff(Month) > 1)),
FUN = seq_along))
df
# Client Month consecutive
#1 A 1 1
#2 A 1 2
#3 A 2 3
#4 A 5 1
#5 A 6 2
#6 A 8 1
#7 B 8 1
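To see how the grouping vector is built, a sketch tracing it on the Month column from the data above:
Month <- c(1L, 1L, 2L, 5L, 6L, 8L, 8L)
diff(Month)                       # 0 1 3 1 2 0 (gap to the previous month)
c(TRUE, diff(Month) > 1)          # TRUE marks the start of a new run
cumsum(c(TRUE, diff(Month) > 1))  # 1 1 1 2 2 3 3 (run ids passed to ave)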
In dplyr, we can create a new group with lag to compare the current month with the previous month and assign row_number() in each group.
library(dplyr)
df %>%
group_by(Client,group=cumsum(Month-lag(Month, default = first(Month)) > 1)) %>%
mutate(consecutive = row_number()) %>%
ungroup %>%
select(-group)
We can create a grouping variable based on the difference of adjacent 'Month' values for each 'Client' and use that to create the sequence:
library(dplyr)
df1 %>%
group_by(Client) %>%
group_by(grp = cumsum(c(TRUE, diff(Month) > 1)), .add = TRUE) %>%
mutate(consec = row_number()) %>%
ungroup %>%
select(-grp)
# A tibble: 7 x 4
# Client Month consecutive consec
# <chr> <int> <int> <int>
#1 A 1 1 1
#2 A 1 2 2
#3 A 2 3 3
#4 A 5 1 1
#5 A 6 2 2
#6 A 8 1 1
#7 B 8 1 1
Or using data.table
library(data.table)
setDT(df1)[, grp := cumsum(c(TRUE, diff(Month) > 1)), Client
][, consec := seq_len(.N), .(Client, grp)
][, grp := NULL][]
data
df1 <- structure(list(Client = c("A", "A", "A", "A", "A", "A", "B"),
Month = c(1L, 1L, 2L, 5L, 6L, 8L, 8L), consecutive = c(1L,
2L, 3L, 1L, 2L, 1L, 1L)), class = "data.frame", row.names = c(NA,
-7L))
I have this sort of data (simplified obviously):
Var1 Var2 Var3
20 0.4 a
50 0.5 a
80 0.6 b
150 0.3 a
250 0.4 b
I want to group them according to Var1 if they fall into an interval of 50, then get the mean of Var1 and Var2, and keep Var3 as is if it's homogeneous, or rename it if the group has mixed labels. In this case I would get:
Var1 Var2 Var3
50 0.5 mixed
150 0.3 a
250 0.4 b
I'm guessing I should use the group_by function from dplyr package but I don't know how exactly. Thanks for your help!
Another dplyr possibility could be:
df %>%
group_by(grp = cumsum(Var1 - lag(Var1, default = first(Var1)) > 50)) %>%
summarise(Var1 = mean(Var1),
Var2 = mean(Var2),
Var3 = ifelse(n_distinct(Var3) > 1, "mixed", as.character(first(Var3)))) %>%
ungroup() %>%
select(-grp)
Var1 Var2 Var3
<dbl> <dbl> <chr>
1 50 0.5 mixed
2 150 0.3 a
3 250 0.4 b
Here's the data frame with dput:
d <- structure(list(Var1 = c(20L, 50L, 80L, 150L, 250L), Var2 = c(0.4,
0.5, 0.6, 0.3, 0.4), Var3 = structure(c(1L, 1L, 2L, 1L, 2L), .Label = c("a",
"b"), class = "factor")), class = "data.frame", row.names = c(NA,
-5L))
I'd:
- create a few temporary columns to determine when a new group starts,
- group and calculate the mean, while also tracking the distinct values of Var3,
- change the label to "mixed" if a group has more than one Var3 value.
In the tidyverse this might look like:
d %>%
# make sure we sort Var1
arrange(Var1) %>%
# increment var1 by 50 and test that against the next row
# if the next value exceeds current by 50, we mark it as a new group
mutate(nextint=Var1+50,
newgroup=Var1>lag(nextint,default=-Inf),
grp=cumsum(newgroup)) %>%
# for each group, get the mean and a comma separated list of distinct Var3 values
group_by(grp) %>%
summarise(
grplbl=floor(max(Var1)/50)*50,
mu=mean(Var2),
mix=paste(collapse=",",unique(Var3))) %>%
# if mix (distinct Var3) has a comma in it, change from e.g. 'a,b' to 'mix'
mutate(mix=ifelse(grepl(',', mix), 'mixed', mix))
# A tibble: 3 x 4
grp grplbl mu mix
<int> <dbl> <dbl> <chr>
1 1 50 0.5 mixed
2 2 150 0.3 a
3 3 250 0.4 b
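To match the asker's column names exactly, the same pipeline can name its summaries Var1/Var2/Var3 and drop the helper grp at the end; a sketch, assuming d as defined above:
d %>%
  arrange(Var1) %>%
  mutate(nextint = Var1 + 50,
    newgroup = Var1 > lag(nextint, default = -Inf),
    grp = cumsum(newgroup)) %>%
  group_by(grp) %>%
  summarise(Var1 = floor(max(Var1)/50)*50,
    Var2 = mean(Var2),
    Var3 = paste(collapse = ",", unique(Var3))) %>%
  mutate(Var3 = ifelse(grepl(",", Var3), "mixed", Var3)) %>%
  ungroup() %>%
  select(-grp)
#> Var1 = 50, 150, 250; Var2 = 0.5, 0.3, 0.4; Var3 = mixed, a, b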