Count occurrences of elements for a set of intervals in R - r

Lets say I have data like this
Number
ID
1.5
X
2.4
X
3.1
Y
3.2
Y
My desired output is
ID
1 < x < 2
2 < x < 3
3 < x < 4
X
1
1
0
Y
0
0
2
What would be an efficient approach to creating this?

We can create a column with cut and then use pivot_wider to reshape from 'long' to 'wide' format
library(dplyr)
library(tidyr)
df1 %>%
mutate(grp = cut(Number, breaks = c(1, 2, 3, 4),
labels = c("1<x<2", "2<x<3", "3<x<4"))) %>%
pivot_wider(names_from = grp, values_from= Number,
values_fn = length, values_fill = 0)
-output
# A tibble: 2 × 4
ID `1<x<2` `2<x<3` `3<x<4`
<chr> <int> <int> <int>
1 X 1 1 0
2 Y 0 0 2
data
df1 <- structure(list(Number = c(1.5, 2.4, 3.1, 3.2), ID = c("X", "X",
"Y", "Y")), class = "data.frame", row.names = c(NA, -4L))

Related

how to duplicate rows with certain condition and create anew variable at the same time

I have a df like below and I would like to transfer it to sth like the table on the right, how can I duplicate the rows with Type=="N" and add new var Grade?
Basically, if Type==N, then Grade can be S or W, that is why we need to duplicate the rows.
df<-structure(list(Type = c("N", "N", "S", "W"), Result = c(8, 9,
7, 6)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))
Using some functions from tidyverse, you can use crossing to duplicate rows and add the "Grade" column at the same time, then filter to match your stated rules.
library(tidyverse)
result <- df %>%
crossing(data.frame(Grade = c('S', 'W'))) %>%
filter(Type == 'N' | Type == Grade)
Type Result Grade
<chr> <dbl> <chr>
1 N 8 S
2 N 8 W
3 N 9 S
4 N 9 W
5 S 7 S
6 W 6 W
I think this approach is extensible to many more conditions assuming yours is the minimal example and you have a larger more complicated dataset.
library(dplyr)
df<-structure(list(Type = c("N", "N", "S", "W"), Result = c(8, 9,
7, 6)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))
df2 <- data.frame(Type2 = c("N", "N"), Grade = c("S", "W"))
df %>%
select(Type, Result) %>%
left_join(df2, by = c("Type" = "Type2")) %>%
mutate(Grade = case_when(Type == "S" ~ "S", Type == "W" ~ "W", TRUE ~ Grade))
Type Result Grade
<chr> <dbl> <chr>
1 N 8 S
2 N 8 W
3 N 9 S
4 N 9 W
5 S 7 S
6 W 6 W
Another option is to use if_else() (or case_when() if there are more complex conditions) to return a list column of multiple values and unnest:
library(dplyr)
library(tidyr)
df %>%
mutate(Grade = if_else(Type == "N", list(c("S", "W")), as.list(Type))) %>%
unnest(Grade)
# A tibble: 6 x 3
Type Result Grade
<chr> <dbl> <chr>
1 N 8 S
2 N 8 W
3 N 9 S
4 N 9 W
5 S 7 S
6 W 6 W
Or:
df %>%
mutate(Grade = case_when(Type == "N" ~ list(c("S", "W")),
TRUE ~ as.list(Type))) %>%
unnest(Grade)
A dplyr way:
We could use bind_rows after using slice.
library(dplyr)
df %>%
slice(1:2) %>%
bind_rows(df) %>%
group_by(Type) %>%
arrange(Result, .by_group = TRUE) %>%
ungroup() %>%
mutate(Grade = rep(c("S","W"),length.out = n()), .before=2)
Type Grade Result
<chr> <chr> <dbl>
1 N S 8
2 N W 8
3 N S 9
4 N W 9
5 S S 7
6 W W 6
Here is a possible data.table option:
library(data.table)
dt <- as.data.table(df)
output <- dt[, CJ(.SD$Type, c('S', 'W')), .(Result)][which(V1 == 'N' | V1 == V2), ]
setnames(output, c(names(dt), "Grade"))
setcolorder(output, c("Result", "Grade", "Type"))
Output
Result Grade Type
1: N S 8
2: N W 8
3: N S 9
4: N W 9
5: S S 7
6: W W 6

R: how to combine rows using Pivot_wider

I have a table like the following:
A, B, C
1, Yes, 3
1, No, 2
2, Yes, 4
2, No, 6
etc
I want to convert it to:
A, Yes, No
1, 3, 2
2, 4, 6
I have tried using:
dat <- dat %>%
spread(B, C) %>%
group_by(A)
However, now I have a bunch of NA values. Is it possible to use pivot_longer to do this instead?
We can use pivot_wider
library(tidyr)
pivot_wider(dat, names_from = B, values_from = C)
-output
# A tibble: 2 x 3
# A Yes No
# <dbl> <dbl> <dbl>
#1 1 3 2
#2 2 4 6
If there are duplicate rows, then an option is to create a sequence by that column
library(data.table)
library(dplyr)
dat1 <- bind_rows(dat, dat) # // example with duplicates
dat1 %>%
mutate(rn = rowid(B)) %>%
pivot_wider(names_from = B, values_from = C) %>%
select(-rn)
-output
# A tibble: 4 x 3
# A Yes No
# <dbl> <dbl> <dbl>
#1 1 3 2
#2 2 4 6
#3 1 3 2
#4 2 4 6
data
dat <- structure(list(A = c(1, 1, 2, 2), B = c("Yes", "No", "Yes", "No"
), C = c(3, 2, 4, 6)), class = "data.frame", row.names = c(NA,
-4L))

Convert logical data from wide to long format in R

I have the following data:
ID cancer cancer_date stroke stroke_date diabetes diabetes_date
1 1 Feb2017 0 Jan2015 1 Jun2015
2 0 Feb2014 1 Jan2015 1 Jun2015
I would like to get
ID condition date
1 cancer xx
1 diabetes xx
2 stroke xx
2 diabetes xx
I tried reshape and gather, but it did not do what I want. Any ideas how can I do this?
This should do it. The key to make it work easily is to change the names of cancer, stroke and diabetes to x_val and then you can use pivot_longer() from tidyr to do the work.
library(tidyr)
library(dplyr)
dat <- tibble::tribble(
~ID, ~cancer, ~cancer_date, ~stroke, ~stroke_date, ~diabetes, ~diabetes_date,
1, 1, "Feb2017", 0, "Jan2015", 1, "Jun2015",
2, 0, "Feb2014", 1, "Jan2015", 1, "Jun2015")
dat %>%
rename("cancer_val" = "cancer",
"stroke_val" = "stroke",
"diabetes_val" = "diabetes") %>%
pivot_longer(cols=-ID,
names_to = c("diagnosis", ".value"),
names_pattern="(.*)_(.*)") %>%
filter(val == 1)
# # A tibble: 4 x 4
# ID diagnosis val date
# <dbl> <chr> <dbl> <chr>
# 1 1 cancer 1 Feb2017
# 2 1 diabetes 1 Jun2015
# 3 2 stroke 1 Jan2015
# 4 2 diabetes 1 Jun2015
library(data.table)
data <- data.table(ID = c(1, 2), cancer = c(1, 0), cancer_date = c("Feb2017", "Feb2014"), stroke = c(0, 1), stroke_date = c("Jan2015", "Jan2015"), diabetes = c(1, 1), diabetes_date = c("Jun2015", "Jun2015"))
datawide <-
melt(data, id.vars = c("ID", "cancer", "stroke", "diabetes"),
measure.vars = c("cancer_date", "stroke_date", "diabetes_date"))
datawide[(cancer == 1 & variable == "cancer_date") |
(stroke == 1 & variable == "stroke_date") |
(diabetes == 1 & variable == "diabetes_date"), .(ID, condition = variable, date = value)]
Try this solution using pivot_longer() and a flag variable to filter the desired states. After pivoting you can filter the values different to zero and only choose the one values. Here the code:
library(tidyverse)
#Code
df2 <- df %>% pivot_longer(cols = -c(ID,contains('_'))) %>%
filter(value!=0) %>% rename(condition=name) %>% select(-value) %>%
pivot_longer(-c(ID,condition)) %>%
separate(name,c('v1','v2'),sep='_') %>%
mutate(Flag=ifelse(condition==v1,1,0)) %>%
filter(Flag==1) %>% select(-c(v1,v2,Flag)) %>%
rename(date=value)
Output:
# A tibble: 4 x 3
ID condition date
<int> <chr> <chr>
1 1 cancer Feb2017
2 1 diabetes Jun2015
3 2 stroke Jan2015
4 2 diabetes Jun2015
Some data used:
#Data
df <- structure(list(ID = 1:2, cancer = 1:0, cancer_date = c("Feb2017",
"Feb2014"), stroke = 0:1, stroke_date = c("Jan2015", "Jan2015"
), diabetes = c(1L, 1L), diabetes_date = c("Jun2015", "Jun2015"
)), class = "data.frame", row.names = c(NA, -2L))
If the first obtain is complex, here another choice:
#Code 2
df2 <- df %>% mutate(across(everything(),~as.character(.))) %>%
pivot_longer(cols = -c(ID)) %>%
separate(name,c('condition','v2'),sep = '_') %>%
replace(is.na(.),'val') %>%
pivot_wider(names_from = v2,values_from=value) %>%
filter(val==1) %>% select(-val)
Output:
# A tibble: 4 x 3
ID condition date
<chr> <chr> <chr>
1 1 cancer Feb2017
2 1 diabetes Jun2015
3 2 stroke Jan2015
4 2 diabetes Jun2015

How do I select column based on value in another column with dplyr?

My data frame looks like this:
id A T C G ref var
1 1 10 15 7 0 A C
2 2 11 9 2 3 A G
3 3 2 31 1 12 T C
I'd like to create two new columns: ref_count and var_count which will have following values:
Value from A column and value from C column, since ref is A and var is C
Value from A column and value from G column, since ref is A and var is G
etc.
So I'd like to select a column based on the value in another column for each row.
Thanks!
We can use pivot_longer to reshape into 'long' format, filter the rows and then reshape it to 'wide' format with pivot_wider
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(cols = A:G) %>%
group_by(id) %>%
filter(name == ref|name == var) %>%
mutate(nm1 = c('ref_count', 'var_count')) %>%
ungroup %>%
select(id, value, nm1) %>%
pivot_wider(names_from = nm1, values_from = value) %>%
left_join(df1, .)
# A tibble: 3 x 9
# id A T C G ref var ref_count var_count
#* <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#1 1 10 15 7 0 A C 10 7
#2 2 11 9 2 3 A G 11 3
#3 3 2 31 1 12 T C 31 1
Or in base R, we can also make use of the vectorized row/column indexing
df1$refcount <- as.matrix(df1[2:5])[cbind(seq_len(nrow(df1)), match(df1$ref, names(df1)[2:5]))]
df1$var_count <- as.matrix(df1[2:5])[cbind(seq_len(nrow(df1)), match(df1$var, names(df1)[2:5]))]
data
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))
The following is a tidyverse alternative without creating a long dataframe that needs filtering. It essentially uses tidyr::nest() to nest the dataframe by rows, after which the correct column can be selected for each row.
df1 %>%
nest(data = -id) %>%
mutate(
data = map(
data,
~mutate(., refcount = .[[ref]], var_count = .[[var]])
)
) %>%
unnest(data)
#> # A tibble: 3 × 9
#> id A T C G ref var refcount var_count
#> <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
#> 1 1 10 15 7 0 A C 10 7
#> 2 2 11 9 2 3 A G 11 3
#> 3 3 2 31 1 12 T C 31 1
A variant of this does not need the (assumed row-specific) id column but defines the nested groups from the unique values of ref and var directly:
df1 %>%
nest(data = -c(ref, var)) %>%
mutate(
data = pmap(
list(data, ref, var),
function(df, ref, var) {
mutate(df, refcount = df[[ref]], var_count = df[[var]])
}
)
) %>%
unnest(data)
The data were specified by akrun:
df1 <- structure(list(id = 1:3, A = c(10, 11, 2), T = c(15, 9, 31),
C = c(7, 2, 1), G = c(0, 3, 12), ref = c("A", "A", "T"),
var = c("C", "G", "C")), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))

Creating an interval in for frequency table in R

I have a dataframe I've created in the form
FREQ CNT
0 5
1 20
2 1000
3 3
4 3
I want to further group my results to be in the following form:
CUT CNT
0+1 25
2+3 1003
4+5 ...
.....
I've tried using the between and cut functions in dplyr but it just adds a new interval column to my dataframe can anyone give me a good indication as to where to go to achieve this?
Here is a way to do it in dplyr:
library(dplyr)
df <- df %>%
mutate(id = 1:n()) %>%
mutate(new_freq = ifelse(id %% 2 != 0, paste0(FREQ, "+", lead(FREQ, 1)), paste0(lag(FREQ, 1), "+", FREQ)))
df <- df %>%
group_by(new_freq) %>%
mutate(new_cnt = sum(CNT))
unique(df[, 4:5])
# A tibble: 2 x 2
# Groups: new_freq [2]
# new_freq new_cnt
# <chr> <int>
#1 0+1 25
#2 2+3 1003
data
df <- structure(list(FREQ = 0:3, CNT = c(5L, 20L, 1000L, 3L)), class = "data.frame", row.names = c(NA, -4L))
A non-elegant solution using dplyr... probably a better way to do this.
dat <- data.frame(FREQ = c(0,1,2,3,4), CNT = c(5,20,1000, 3, 3))
dat2 <- dat %>%
mutate(index = 0:(nrow(dat)-1)%/%2) %>%
group_by(index)
dat2 %>%
summarise(new_CNT = sum(CNT)) %>%
left_join(dat2 %>%
mutate(CUT = paste0(FREQ[1], "+", FREQ[2])) %>%
distinct(index, CUT),
by = "index") %>%
select(-index)
# A tibble: 3 x 2
new_CNT CUT
<dbl> <chr>
1 25 0+1
2 1003 2+3
3 3 4+NA

Resources