Remove duplicate lines while keeping the bottom lines - r

I would like to remove duplicate lines in R keeping the information of the lower lines, that is, from this data:
example <- structure(list(var1 = c(1, 1, 2, 2, 3, 4, 5, 6, 6), var2 = c(0,
0, 0, 0, 0, 0, 0, 0, 0), var3 = c(1, 0, 0, 0, 0, 1, 0, 0, 0),
var4 = c(1, 1, 1, 1, 0, 1, 1, 0, 0), var5 = c(1, 1, 1, 0,
0, 1, 1, 0, 0), Year = 2001:2009), row.names = c(NA, -9L), class = "data.frame")
I would like to remove the duplicates keeping the lines at the bottom, so that I get:
example1 <- structure(list(var1 = c(1, 2, 3, 4, 5, 6), var2 = c(0, 0, 0,
0, 0, 0), var3 = c(0, 0, 0, 1, 0, 0), var4 = c(1, 1, 0, 1, 1,
0), var5 = c(1, 0, 0, 1, 1, 0), Year = c(2002, 2004, 2005, 2006,
2007, 2009)), row.names = c(NA, -6L), class = "data.frame")
Is it possible to apply the duplicated function or the distinct function of the `dplyr package?
I appreciate any help. Thanks.

Is this what you want?
example %>%
group_by(var1) %>%
slice_tail()
output
# A tibble: 6 x 6
# Groups: var1 [6]
var1 var2 var3 var4 var5 Year
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 1 0 0 1 1 2002
2 2 0 0 1 0 2004
3 3 0 0 0 0 2005
4 4 0 1 1 1 2006
5 5 0 0 1 1 2007
6 6 0 0 0 0 2009

The #ThomasIsCoding response, with the dplyr tools, worked well. I found another possibility, which seems faster:
example1 <- example[!duplicated(example$var1, fromLast = T), ]

distinct keeps 1st row in each group, if you want to keep last row you can reverse the rows and then apply distinct.
library(dplyr)
example %>%
slice(n():1) %>%
distinct(var1, .keep_all = TRUE) %>%
arrange(var1)
# var1 var2 var3 var4 var5 Year
#1 1 0 0 1 1 2002
#2 2 0 0 1 0 2004
#3 3 0 0 0 0 2005
#4 4 0 1 1 1 2006
#5 5 0 0 1 1 2007
#6 6 0 0 0 0 2009
Alternatively you can also use slice :
example %>% group_by(var1) %>% slice(n())

Related

Count occurrences in specific column ranges and return factor variable, R

I have data like this:
df<-structure(list(levels_incised___1 = c(0, 0, 0, 0, 0, 0), levels_incised___2 = c(1,
0, 0, 0, 0, 0), levels_incised___3 = c(1, 0, 0, 0, 0, 0), levels_incised___4 = c(1,
0, 0, 0, 0, 0), levels_incised___5 = c(1, 0, 0, 0, 0, 0), levels_incised___6 = c(1,
0, 0, 0, 0, 0), levels_incised___7 = c(1, 0, 0, 0, 0, 0), levels_incised___8 = c(1,
1, 1, 0, 0, 0), levels_incised___9 = c(1, 1, 1, 0, 0, 0), levels_incised___10 = c(1,
1, 1, 0, 0, 0), levels_incised___11 = c(0, 1, 0, 0, 0, 0), levels_incised___12 = c(0,
1, 0, 0, 0, 0), levels_incised___13 = c(0, 1, 0, 0, 0, 0), levels_incised___14 = c(0,
1, 0, 0, 0, 0), levels_incised___15 = c(0, 1, 0, 0, 0, 0), levels_incised___16 = c(0,
0, 0, 0, 0, 0), levels_incised___17 = c(0, 0, 0, 0, 0, 0), levels_incised___18 = c(0,
0, 0, 0, 0, 0), levels_incised___19 = c(0, 0, 0, 0, 0, 0), levels_incised___20 = c(0,
0, 0, 0, 0, 0), levels_incised___21 = c(0, 0, 0, 0, 0, 0), levels_incised___22 = c(0,
0, 0, 0, 1, 0), levels_incised___23 = c(0, 0, 0, 0, 1, 1), levels_incised___24 = c(0,
0, 0, 0, 1, 1), levels_incised___25 = c(0, 0, 0, 0, 1, 1), levels_incised___26 = c(0,
0, 0, 0, 1, 1), levels_incised___27 = c(0, 0, 0, 1, 1, 1), levels_incised___28 = c(0,
0, 0, 1, 1, 1), levels_incised___29 = c(0, 0, 0, 1, 1, 0), levels_incised___30 = c(0,
0, 0, 1, 1, 0), levels_incised___31 = c(0, 0, 0, 0, 0, 0)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
That originally came from this Redcap input where each button was one of those columns:
And I need to create a column at the end (lets call it Level) with these possible inputs:
Cervical (any of the c buttons)
Thoracic (the t's)
Lumbar (the L's)
Sacral (sacral)
Thoracocervical (t's or c's)
Thoracolumbar (t's or l's)
Lumbosacral (l's and sacral)
So for instance, the patient in the first row had "1"'s in levels_incised_2 through levels_incised_10... meaning they had values in both the cervical range and the thoracic range. So that patient should get "Thoracocervical".
The patient in row 2 had 1's in 8 through 15, so they'd only get a "thoracic"
Does anyone know the most straight forward way to accomplish this?
Oh one last detail, there's 100+ other columns so it'd be nice if I could select/name these specific ones to count
A few things to resolve here:
find a way to convert levels...# to one of the C/T/... categories;
produce logic to infer based on presence of groups.
I think the first can be done by extracting the number and using findInterval to determine with of C/T/... each column belongs to. From there, we can do some simple c_across to find "any" in a group, and case_when to get your Level labels.
library(dplyr)
# helper function for renaming
func <- function(z) {
num <- as.integer(gsub("\\D", "", z))
grp <- c("C","T","L","S","Co","unclear")[findInterval(num, 1+c(0, 7, 19, 24, 29, 30, 31))]
grp <- paste0(grp, ave(grp, grp, FUN = seq_along))
# fix those that do not need numbering
grp[grepl("^Co", grp)] <- "Co"
grp[grepl("^unc", grp)] <- "unclear"
grp
}
out <- df %>%
rename_with(.cols = starts_with("levels"), .fn = func) %>%
rowwise() %>%
mutate(
anyC = sum(c_across(C1:C7)) > 0,
anyT = sum(c_across(T1:T12)) > 0,
anyL = sum(c_across(L1:L5)) > 0,
anyS = sum(c_across(S1:S5)) > 0
) %>%
ungroup() %>%
mutate(
Level = case_when(
anyC & anyT & anyL ~ "More than 2?",
anyL & anyS ~ "Lumbosacral",
anyT & anyL ~ "Thoracolumbar",
anyT & anyC ~ "Thoracocervical",
anyS ~ "Sacral",
anyL ~ "Lumbar",
anyT ~ "Thoracic",
anyC ~ "Cervical",
TRUE ~ "Nothing?"
)
)
out
# # A tibble: 6 x 36
# C1 C2 C3 C4 C5 C6 C7 T1 T2 T3 T4 T5 T6 T7 T8 T9 T10 T11 T12 L1 L2 L3 L4 L5 S1 S2 S3 S4 S5 Co unclear anyC anyT anyL anyS Level
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <lgl> <lgl> <lgl> <chr>
# 1 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 TRUE TRUE FALSE FALSE Thoracocervical
# 2 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 FALSE TRUE FALSE FALSE Thoracic
# 3 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 FALSE TRUE FALSE FALSE Thoracic
# 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 FALSE FALSE FALSE TRUE Sacral
# 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 FALSE FALSE TRUE TRUE Lumbosacral
# 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 FALSE FALSE TRUE TRUE Lumbosacral
out$Level
# [1] "Thoracocervical" "Thoracic" "Thoracic" "Sacral" "Lumbosacral" "Lumbosacral"
If you don't want to keep the renaming, then you can combine the Level result to your original frame with cbind(df, Level = out$Level).
using package dplyr:
## vertebra codes needed later on
vertebra_codes <- c(
paste0('C',1:7), paste0('T',1:12),
paste0('L',1:5), paste0('S',1:5),
'X', ## for Coccyx
'-' ## for unknown
)
df %>%
mutate(
## assuming each row is a case:
case_id = paste0('case_',row_number())
) %>%
## reshape the data from wide to long format:
pivot_longer(
cols = -case_id,
names_to = 'level_incised', values_to = 'is_incised'
) %>%
mutate(
## remove the redundant 'levels_incised__' prefix:
level_incised = gsub('.*_','',level_incised),
## assign the vertebra corresponding to 'level':
vertebra = vertebra_codes[as.integer(level_incised)],
## assign the spine region (e.g.: all lumbal vert. start with 'L'
spine_region = substr(vertebra,1,1)
) %>%
filter(is_incised == 1) %>% ## we're interested in incised vert. only
## remove replicates (more than one vertebra per spine region affected:
distinct(case_id, spine_region) %>%
## do the counts per case:
group_by(case_id) %>%
## string together the affected regions per case:
summarise(incised_regions = paste(spine_region, collapse = ','))
result:
# A tibble: 6 x 2
case_id incised_regions
<chr> <chr>
1 case_1 C,T
2 case_2 T
3 case_3 T
4 case_4 S,X
5 case_5 L,S,X
6 case_6 L,S
(Note that original `df` remains unchanged throughout the processing pipeline. However you can break up the pipeline by removing the `%>%` operator and inspect the intermediary steps, or assign them to temporary objects.)
extra / for fun: example code to ggplot the spine with vertebra status (incised or not) per patient.

Create column conditioning the behavior of rows in the dataset

I would like to do something very specific. I have a vast set of data, which, in summary, looks more or less like this, with values 0, 1 and 2:
I need to create a situation variable so that it contains the value 0, 1 and 2.
The value 0 for cases that contain only 0's and 1's in the entire line.
The value 1 for the case where the value 2 appears, but at some point 1 appears before it.
The value 2 for the case where the value 2 appears, but at some point 0 appears before it.
So it's something close to:
structure(list(X1 = c(1, 1, 1, 1, 1, 1, 1, 1, 0, 1), X2 = c(1,
1, 1, 1, 0, 0, 0, 0, 0, 2), X3 = c(0, 1, 1, 1, 1, 0, 0, 1, 0,
0), X4 = c(0, 1, 1, 0, 1, 1, 0, 0, 0, 0), X5 = c(2, 1, 1, 0,
2, 1, 1, 0, 0, 0), X6 = c(2, 1, 1, 0, 2, 1, 1, 0, 0, 0), X7 = c(2,
1, 1, 1, 2, 1, 1, 2, 0, 0), X8 = c(0, 1, 1, 1, 2, 1, 2, 2, 2,
0)), class = "data.frame", row.names = c(NA, 10L))
I wrote a score function and applied it over all the rows of your dataframe.
score <- function(x) {
a <- which(x == 2)
ifelse(length(a) > 0, ifelse(a[1] >=2, 2 - x[a[1] - 1], 1), 0)
}
df <- structure(list(X1 = c(1, 1, 1, 1, 1, 1, 1, 1, 0, 1),
X2 = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 2),
X3 = c(0, 1, 1, 1, 1, 0, 0, 1, 0, 0),
X4 = c(0, 1, 1, 0, 1, 1, 0, 0, 0, 0),
X5 = c(2, 1, 1, 0, 2, 1, 1, 0, 0, 0),
X6 = c(2, 1, 1, 0, 2, 1, 1, 0, 0, 0),
X7 = c(2, 1, 1, 1, 2, 1, 1, 2, 0, 0),
X8 = c(0, 1, 1, 1, 2, 1, 2, 2, 2, 0)),
class = "data.frame", row.names = c(NA, 10L))
df$situation <- sapply(1:nrow(df), function(i) score(as.numeric(df[i,])))
df
Here's a tidyverse approach.
I'll first concatenate all columns together, then use grepl() to look for 12 or 02.
library(tidyverse)
df %>% rowwise() %>%
mutate(concat = paste(c_across(everything()), collapse = "")) %>%
ungroup() %>%
mutate(situation = case_when(
!grepl(2, concat) ~ 0,
grepl("12", concat) ~ 1,
grepl("02", concat) ~ 2
)) %>%
select(-concat)
Output
# A tibble: 10 x 9
X1 X2 X3 X4 X5 X6 X7 X8 situation
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 0 2 2 2 0 2
2 1 1 1 1 1 1 1 1 0
3 1 1 1 1 1 1 1 1 0
4 1 1 1 0 0 0 1 1 0
5 1 0 1 1 2 2 2 2 1
6 1 0 0 1 1 1 1 1 0
7 1 0 0 0 1 1 1 2 1
8 1 0 1 0 0 0 2 2 2
9 0 0 0 0 0 0 0 2 2
10 1 2 0 0 0 0 0 0 1
Note that this solution assumes that:
2 will not appear in the first column
1 or 2 in the situation is defined by the number immediately before 2 in your dataset
There will not be a case of 12 and 02 happening in the same row

Cumulative count for a column using R

I got data like this
structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1 = c(0,
0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1), drug_2 = c(0, 1, 1, 1, 1, 0,
1, 0, 0, 1, 0, 1)), class = "data.frame", row.names = c(NA, -12L
))
I would like to get the cumulative count of each column for each id and get the data like this
structure(list(id2 = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1_b = c(0,
0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 2), drug_2_b = c(0, 1, 2, 3, 4,
0, 5, 0, 0, 1, 0, 2)), class = "data.frame", row.names = c(NA,
-12L))
You can get a cumulative sum with cumsum.
To split data.frame into subsets, you can use split and then lapply cumsum over the list of the data.frames and again over the list of the columns, or you can use the ave function which does exactly that:
data = structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1 = c(0,
0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1), drug_2 = c(0, 1, 1, 1, 1, 0,
1, 0, 0, 1, 0, 1)), class = "data.frame", row.names = c(NA, -12L
))
data[-1] = ave(data[-1], data$id, FUN=cumsum)
edit:
I assumed that the cumulative sum is requested (as per instructions) and that there is a mistake in the example data. If the example data is correct, then the condition is If the count is zero, don't do cumulative sum and leave at zero or ifelse(x == 0, 0, cumsum(x)) (as per #r2evans). However, this construct doesn't work when applied for the data.frame. A more complex helper function is required:
data[-1] = ave(data[-1], data$id, FUN=function(x){
y = cumsum(x)
y[x == 0] = 0
y
})
We can now compare it with the requested (renamed) data:
result = structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2), drug_1 = c(0,
0, 0, 0, 0, 1, 2, 0, 0, 1, 0, 2), drug_2 = c(0, 1, 2, 3, 4,
0, 5, 0, 0, 1, 0, 2)), class = "data.frame", row.names = c(NA,
-12L))
identical(data, result)
Base R,
ave(df$drug_2, df$id, FUN = function(z) ifelse(z == 0, z, cumsum(z)))
# [1] 0 1 2 3 4 0 5 0 0 1 0 2
Edit Simplified the solution after reading r2evans' approach.
You could use
library(dplyr)
df %>%
group_by(id) %>%
mutate(across(starts_with("drug"),
~ifelse(.x == 0, 0, cumsum(.x)))) %>%
ungroup()
This returns
# A tibble: 12 x 3
id drug_1 drug_2
<dbl> <dbl> <dbl>
1 1 0 0
2 1 0 1
3 1 0 2
4 1 0 3
5 1 0 4
6 1 1 0
7 1 2 5
8 2 0 0
9 2 0 0
10 2 1 1
11 2 0 0
12 2 2 2
Base R solution:
# Resolve the names of vectors we want to cumulatively sum:
# drug_vec_names => character vector
drug_vec_names <- grep( "^drug\\_", colnames(df), value = TRUE)
# Resolve the names of vectors we want to keep:
# not_drug_vec_names => character vector
not_drug_vec_names <- names(df)[!(names(df) %in% drug_vec_names)]
# Calculate the result: res => data.frame
res <- setNames(
cbind(
df[,not_drug_vec_names],
replace(
ave(
df[,drug_vec_names],
df[,not_drug_vec_names],
FUN = cumsum
),
df[,drug_vec_names] == 0,
0
)
),
c(not_drug_vec_names, drug_vec_names)
)
If you have binary values (1/0) in drug columns, you can multiply the cumulative sum with itself to get 0 for 0 values.
library(dplyr)
df %>%
group_by(id) %>%
mutate(across(starts_with('drug'), ~cumsum(.) * .)) %>%
ungroup
# id drug_1 drug_2
# <dbl> <dbl> <dbl>
# 1 1 0 0
# 2 1 0 1
# 3 1 0 2
# 4 1 0 3
# 5 1 0 4
# 6 1 1 0
# 7 1 2 5
# 8 2 0 0
# 9 2 0 0
#10 2 1 1
#11 2 0 0
#12 2 2 2

Converting binary to list

I have somating like a binary dataframe
> dput(head(dat))
structure(list(CDR3.aa = c("CALWEVQELGKKIKVF", "CAATVGGWGKLQF",
"CACDPLYGGITGGFNTDKLIF", "CACDTLLPTSLGDMAKLIF", "CALGELSSDGGGAIF",
"CALSNTGGFKTIF"), TCR_CS001_T1 = c(1, 1, 1, 1, 1, 0), TCR_CS001_T2 = c(0,
1, 1, 1, 1, 0), TCR_CS002 = c(1, 0, 0, 0, 0, 0), TCR_HC002 = c(0,
0, 0, 0, 0, 1), TCR_HC003 = c(1, 0, 0, 0, 0, 1)), row.names = c(NA,
-6L), .internal.selfref = <pointer: 0x0000023f7a101ef0>, class = c("immunr_public_repertoire",
"data.table", "data.frame"))
That shows if an amin acide exists in a sample we see 1 and if absent shown by 0
I want to replace 1 and 0 by amino acid itself
How I can do that please?
If CDR3.aa is the amino acid column you can do :
dplyr :
library(dplyr)
dat %>% mutate(across(-CDR3.aa, ~ifelse(. == 1, CDR3.aa, .)))
# CDR3.aa TCR_CS001_T1 TCR_CS001_T2 TCR_CS002
#1: CALWEVQELGKKIKVF CALWEVQELGKKIKVF 0 CALWEVQELGKKIKVF
#2: CAATVGGWGKLQF CAATVGGWGKLQF CAATVGGWGKLQF 0
#3: CACDPLYGGITGGFNTDKLIF CACDPLYGGITGGFNTDKLIF CACDPLYGGITGGFNTDKLIF 0
#4: CACDTLLPTSLGDMAKLIF CACDTLLPTSLGDMAKLIF CACDTLLPTSLGDMAKLIF 0
#5: CALGELSSDGGGAIF CALGELSSDGGGAIF CALGELSSDGGGAIF 0
#6: CALSNTGGFKTIF 0 0 0
# TCR_HC002 TCR_HC003
#1: 0 CALWEVQELGKKIKVF
#2: 0 0
#3: 0 0
#4: 0 0
#5: 0 0
#6: CALSNTGGFKTIF CALSNTGGFKTIF
data.table :
library(data.table)
dat[, (names(dat)[-1]) := lapply(.SD, function(x) ifelse(x == 1, CDR3.aa, x)), .SDcols = -1]

compare rows and fill gaps with previous values in a data table

I have data table that looks like this:
library(data.table)
data <- data.table(time = c(seq(0, 14)),
anom = c(0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0),
gier = c(0, 0, 0, 4, 9, 7, 2, 0, 3, 1, 4, 2, 0, 0, 0))
Now I want to fill the gaps (zeros) with ones in column anom so that the result looks like this:
res <- data.table(time = c(seq(0, 14)),
anom = c(0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0),
gier = c(0, 0, 0, 4, 9, 7, 2, 0, 3, 1, 4, 2, 0, 0, 0))
In addition there is the problem that I want to be flexible with the gap size so I can choose how big the gap can be. There must be an easy way to do something similar to LOCF only for real values (maybe filling it with the previous value of the row and not only ones or zeros) and not only for NA's like the functions fill or na.locf.
An example using the maxgap argument to select the maximum gap size
library(zoo)
na.fill(
na.locf(
replace(data$anom,data$anom==0,NA),
na.rm=F,
maxgap=2
),0
)
[1] 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0
Here is another option using rolling join:
maxgap <- 1L
data[, c("rn", "lu") := .(.I, anom)]
data[anom==0L, lu := fcoalesce(
data[anom!=0L][.SD, on=.(rn=rn), roll=maxgap, rollends=c(FALSE, FALSE), x.anom],
anom)
]
output:
time anom gier rn lu
1: 0 0 0 1 0
2: 1 0 0 2 0
3: 2 0 0 3 0
4: 3 1 4 4 1
5: 4 1 9 5 1
6: 5 1 7 6 1
7: 6 0 2 7 1
8: 7 0 0 8 0
9: 8 1 3 9 1
10: 9 0 1 10 1
11: 10 1 4 11 1
12: 11 1 2 12 1
13: 12 0 0 13 0
14: 13 0 0 14 0
15: 14 0 0 15 0

Resources