Count number of occurrences of two column cases - r

I have a dataframe:
ID col1 col2
1 LOY A
2 LOY B
3 LOY B
4 LOY B
5 LOY A
I want to count the number of occurrences of each unique combination of col1 and col2. So the desired result is:
event count
loy-a 2
loy-b 3
How could I do that?

You can also try:
library(dplyr)
# Build the lower-cased "col1-col2" label first, then tally the rows per label.
new <- df %>%
  mutate(event = tolower(paste0(col1, "-", col2))) %>%
  group_by(event) %>%
  summarise(count = n())
Output:
# A tibble: 2 x 2
event count
<chr> <int>
1 loy-a 2
2 loy-b 3
Some data used:
#Data
# Example data: five rows sharing col1 = "LOY", with col2 split between A and B.
df <- data.frame(
  ID = 1:5,
  col1 = rep("LOY", 5),
  col2 = c("A", "B", "B", "B", "A"),
  stringsAsFactors = FALSE
)

Here is an option where we convert the columns to lower case, then get the count and unite the 'col1', 'col2' to a single 'event' column
library(dplyr)
library(tidyr)
# Lower-case both key columns, tally each (col1, col2) pair, then glue the
# pair into a single "event" label.
df1 %>%
  mutate(col1 = tolower(col1), col2 = tolower(col2)) %>%
  count(col1, col2) %>%
  unite("event", col1, col2, sep = "-")
-output
# event n
#1 loy-a 2
#2 loy-b 3
NOTE: Returns the OP's expected output
Or using base R
# Tabulate the lower-cased "col1-col2" labels directly with base R.
table(tolower(paste(df1$col1, df1$col2, sep = "-")))
data
# Example data: five rows sharing col1 = "LOY", with col2 split between A and B.
df1 <- data.frame(
  ID = 1:5,
  col1 = rep("LOY", 5),
  col2 = c("A", "B", "B", "B", "A"),
  stringsAsFactors = FALSE
)

Related

Join of column values for specific row values

I'd like to join (left_join) a tibble (df2) to another one (df1) only where the value of col2 in df1 is NA. I am currently using a code that is not very elegant. Any advice on how to shorten the code would be greatly appreciated!
library(tidyverse)
# df1 contains NAs in col2 that need to be replaced by values from df2,
# matched on col1
df1 <- tibble(col1 = c("a", "b", "c", "d"), col2 = c(1, 2, NA, NA), col3 = c(10, 20, 30, 40))
# df2 holds the replacement values for the same col1 keys
df2 <- tibble(col1 = c("a", "b", "c", "d"), col2 = c(5, 6, 7, 8), col3 = c(50, 60, 70, 80))
# my current approach
# Part 1: rows whose col2 is already filled are kept unchanged.
df3 <- df1 %>%
filter(!is.na(col2))
# Part 2: rows with a missing col2 keep only their key, then take col2 and col3
# from df2. No `by` is given, so the join uses the only shared column, col1.
df4 <- df1 %>%
filter(is.na(col2)) %>%
select(col1)%>%
left_join(df2)
# expected output tibble: the untouched rows followed by the re-filled rows
df_final <- df3 %>%
bind_rows(df4)
Here's a small dplyr answer that works for me, although it might get slow if you have tons of rows:
# Append, for every row of df1 with a missing col2, the matching row of df2;
# then drop the rows whose col2 is still NA (the originals that were replaced).
bind_rows(
  df1,
  df1 %>%
    filter(is.na(col2)) %>%
    select(col1) %>%
    left_join(df2, by = "col1")
) %>%
  filter(!is.na(col2))
We can use data.table methods
library(data.table)
# Convert both tables to data.table by reference, then patch df1$col2 in place:
# keep the existing value and fall back to df2's col2 (i.col2) where it is NA.
setDT(df1)
setDT(df2)
df1[df2, col2 := fcoalesce(col2, i.col2), on = "col1"]
-output
> df1
col1 col2 col3
1: a 1 10
2: b 2 20
3: c 7 30
4: d 8 40
Or an option with tidyverse
library(dplyr)
library(stringr)
# After the join every shared column exists twice: "<name>.x" (from df1) and
# "<name>.y" (from df2). For each ".x" column, keep its value and fall back to
# the matching ".y" column where it is NA; finally strip the ".x" suffix.
df1 %>%
  left_join(df2, by = "col1") %>%
  transmute(
    col1,
    across(
      ends_with(".x"),
      # The suffix is escaped and anchored: an unescaped "." matches any
      # character, so ".x" would hit the "ax" in a name such as "max.x".
      ~ coalesce(., get(str_replace(cur_column(), "\\.x$", ".y")))
    )
  ) %>%
  rename_with(function(nm) str_remove(nm, "\\.x$"), ends_with(".x"))
-output
# A tibble: 4 x 3
col1 col2 col3
<chr> <dbl> <dbl>
1 a 1 10
2 b 2 20
3 c 7 30
4 d 8 40

How fill a dataframe from another one in R?

I want to fill df2 with information from df1.
df1 as below
ID Mutation
1 A
2 B
2 C
3 A
df2 as below
ID A B C
1
2
3
For example, if mutation A is found for ID 1, then I want the corresponding cell in df2 to be marked as "Y".
So the df2 result should be
ID A B C
1 Y
2 Y Y
3 Y
I have hundreds of IDs and more than 20 mutations. How can I efficiently achieve this in R? Thanks!
Using data.table you can try
library(data.table)
setDT(df)
# One row per ID, one column per mutation; a cell holds the mutation name when
# that (ID, Mutation) pair occurs and NA otherwise. Duplicate pairs are dropped
# first so dcast() never has to aggregate (which would turn cells into counts).
df2 <- dcast(
  unique(df, by = c("ID", "Mutation")),
  ID ~ Mutation,
  value.var = "Mutation"
)
# Recode every mutation column, whatever its name, instead of hard-coding
# c("A", "B", "C"): present -> "Y", absent -> " ".
mutation_cols <- setdiff(names(df2), "ID")
df2[, (mutation_cols) := lapply(.SD, function(x) ifelse(is.na(x), " ", "Y")),
    .SDcols = mutation_cols]
df2
#Output
ID A B C
1: 1 Y
2: 2 Y Y
3: 3 Y
Create a new column with value 'Y' and cast the data in wide format.
library(dplyr)
library(tidyr)
# Tag every observed (ID, Mutation) pair with "Y", then spread the mutations
# into columns; pairs that never occur are filled with an empty string.
pivot_wider(
  data = mutate(df, value = "Y"),
  names_from = Mutation,
  values_from = value,
  values_fill = ""
)
# ID A B C
# <int> <chr> <chr> <chr>
#1 1 "Y" "" ""
#2 2 "" "Y" "Y"
#3 3 "Y" "" ""
data
# Example data: one row per (ID, Mutation) observation.
df <- data.frame(
  ID = c(1L, 2L, 2L, 3L),
  Mutation = c("A", "B", "C", "A"),
  stringsAsFactors = FALSE
)

How to Classify data frame Based on a Columns in R? [duplicate]

This question already has answers here:
Assign unique ID based on two columns [duplicate]
(2 answers)
Closed 3 years ago.
I have a data frame and has columns like this:
gene col1 col2 type
------------------------------
gene_1 a b 1
gene_2 aa bb 2
gene_3 a b 1
gene_4 aa bb 2
I want to derive the column "type" from the columns "col1" and "col2", so I need a classification based on "col1" and "col2". How should I do this in R?
thanks a lot
Based on the output, an option is to create group indices from the columns 'col1' and 'col2'
library(dplyr)
# Number each distinct (col1, col2) combination. Grouping first and calling
# group_indices() on the grouped data avoids the `group_indices(.data, ...)`
# form, which is deprecated (and warns) since dplyr 1.0.0.
df1 %>%
  mutate(type = group_indices(group_by(., col1, col2)))
#   gene col1 col2 type
#1 gene_1 a b 1
#2 gene_2 aa bb 2
#3 gene_3 a b 1
#4 gene_4 aa bb 2
If there are multiple names, then one option is to convert the string column names to symbols and then evaluate (!!!)
# Same numbering, with the grouping columns picked by position: their names are
# turned into symbols and spliced (!!!) into group_by(). Grouping before
# group_indices() avoids the deprecated `group_indices(.data, ...)` form.
df1 %>%
  mutate(type = group_indices(group_by(., !!! rlang::syms(names(.)[2:3]))))
Or in data.table
library(data.table)
# Convert df1 to a data.table by reference, then store the running group
# counter (.GRP) of each (col1, col2) combination in `type`.
setDT(df1)
df1[, type := .GRP, by = c("col1", "col2")]
data
# Example data: four genes whose (col1, col2) pair determines `type`.
df1 <- data.frame(
  gene = c("gene_1", "gene_2", "gene_3", "gene_4"),
  col1 = c("a", "aa", "a", "aa"),
  col2 = c("b", "bb", "b", "bb"),
  type = c(1L, 2L, 1L, 2L),
  stringsAsFactors = FALSE
)

Create new column of character vectors

I'm trying to combine two columns of type "character" into a new column. That is,
ColA ColB ColC
"A" "1" c("A", "1")
"B" "2" c("B", "2")
"C" "3" c("C", "3")
I have tried:
# Attempt that does not work: list(ColA, ColB) is a two-element list holding
# the two whole columns, not one c(ColA, ColB) vector per row.
df %>%
mutate(ColC = list(ColA, ColB))
and other variants but this doesn't work. Anyone know how to do this?
A simple paste would do the job in this example
# Build the example frame, then join the two columns into one space-separated
# string per row.
df <- data.frame(colA = c("A", "B", "C"), colB = c("1", "2", "3"))
df[["ColC"]] <- paste(df[["colA"]], df[["colB"]])
df
colA colB ColC
1 A 1 A 1
2 B 2 B 2
3 C 3 C 3
We can use rowwise
library(tidyverse)
# Build one character vector per row. Inside a magrittr pipe `.` is the whole
# data frame, so c(.) would store every column in full on each row; naming the
# columns makes c() see only the current row's values. ungroup() drops the
# row-wise grouping again so later verbs work on the full table.
df %>%
  rowwise() %>%
  mutate(ColC = list(c(ColA, ColB))) %>%
  ungroup()
Or using pmap
# Row-wise alternative with purrr: pmap() walks the rows of df and hands each
# row's values to c() as named arguments.
mutate(df, ColC = pmap(df, c))
data
# Example data: a character column and an integer column.
df <- data.frame(
  ColA = c("A", "B", "C"),
  ColB = 1:3,
  stringsAsFactors = FALSE
)
If you do not want to use dplyr, you can use base R: df$ColC <- apply(df[,c("ColA", "ColB")], 1, paste, collapse = " ").

R paste0 2 columns if not NA

I would like to paste0 two columns if the element in one column is not NA. If an element of one column is NA, then keep only the element of the other column.
# Input data (dput output): col1 is a factor with levels A, B, C; col2 is
# numeric and contains one NA.
structure(list(col1 = structure(1:3, .Label = c("A", "B", "C"),
class = "factor"), col2 = c(1, NA, 3)), .Names = c("col1", "col2"),
class = "data.frame",row.names = c(NA, -3L))
# col1 col2
# 1 A 1
# 2 B NA
# 3 C 3
# Presumably the desired result (dput output): the same col1 and col2 plus a
# character column col3 ("A|1", "B", "C|3").
structure(list(col1 = structure(1:3, .Label = c("A", "B", "C"),
class = "factor"),col2 = c(1, NA, 3), col3 = c("A|1", "B", "C|3")),
.Names = c("col1", "col2", "col3"), row.names = c(NA,-3L),
class = "data.frame")
# col1 col2 col3
#1 A 1 A|1
#2 B NA B
#3 C 3 C|3
you can also do it with regular expressions:
# Paste the columns with "|" and strip a missing side. Same idea as the pattern
# explained below, but anchored with ^ and $ so that only a whole "NA" value at
# the start or end is removed; without the anchors a value that merely ends in
# "NA" (e.g. "DNA|1") would lose those letters.
df$col3 <- sub("^NA\\||\\|NA$", "", with(df, paste0(col1, "|", col2)))
That is, paste them in regular way and then replace any "NA|" or "|NA" with "". Note that | needs to be "double escaped" because it means "OR" in regexps, that's why the strange pattern NA\\||\\|NA means actually "NA|" OR "|NA".
As @Roland says, this is easy using ifelse (just translate the mental logic into a series of nested ifelse statements):
# Three cases per row: both values present -> "col1|col2"; otherwise fall back
# to whichever of the two is not NA (as text).
x <- transform(
  x,
  col3 = ifelse(
    !is.na(col1) & !is.na(col2),
    paste0(col1, "|", col2),
    ifelse(is.na(col1), as.character(col2), as.character(col1))
  )
)
update: need as.character in some cases.
Try:
# Convert the factor to character first so ifelse() returns the labels rather
# than the factor's integer codes.
df$col1 <- as.character(df$col1)
# Use the other column when one side is NA, otherwise join both with "|".
df$col3 <- with(df, ifelse(is.na(col1), col2,
                    ifelse(is.na(col2), col1, paste0(col1, "|", col2))))
df
col1 col2 col3
1 A 1 A|1
2 B NA B
3 C 3 C|3
You could also do:
library(stringr)
# Paste the non-missing values of col1 and col2 for each row. Only these two
# columns are passed to apply(): using the whole data frame would also paste in
# col3 itself once it exists (e.g. when the snippet is run a second time).
# apply() converts each row to character and may pad numbers with spaces,
# hence the str_trim().
df$col3 <- apply(df[c("col1", "col2")], 1, function(x)
  paste(str_trim(x[!is.na(x)]), collapse = "|"))
df
# col1 col2 col3
#1 A 1 A|1
#2 B NA B
#3 C 3 C|3

Resources