I have data that looks like this:
Animal Date.1   Weight.1 Date.2 Weight.2 Date.3 Weight.3 Date.4 Weight.4
1      12/18/19 55       1/2/20 67       NA     NA       6/6/20 101
2      12/18/19 64       1/3/20 69       2/4/20 80       NA     NA
3      12/18/19 75       NA     NA       NA     NA       NA     NA
4      NA       NA       1/3/20 85       NA     NA       NA     NA
5      12/18/19 88       1/6/20 86       2/7/20 96       6/6/20 100
And I would like to select, for each animal, the last weight recorded after Weight.1, like this:
Animal Date.last Last Weight
1      6/6/20    101
2      2/4/20    80
3      NA        NA
4      1/3/20    85
5      6/6/20    100
Sorry, I didn't show any scripts but I didn't even know where to start.
Here is an option after reshaping to 'long' format
library(dplyr)
library(tidyr)
library(stringr)
df1 %>%
  pivot_longer(cols = -Animal, names_to = c(".value", "group"),
               names_sep = "[.]", values_drop_na = TRUE) %>%  # wide -> long; drop visits where Date/Weight are NA
  group_by(Animal) %>%
  slice(n()) %>%   # keep the last recorded visit per animal
  ungroup %>%
  mutate_at(vars(Date, Weight), ~ replace(., group == 1, NA)) %>%  # animals whose only record is visit 1 get NA
  select(-group) %>%
  rename_at(2:3, ~ str_c(., 'last'))  # Date -> Datelast, Weight -> Weightlast
# A tibble: 5 x 3
# Animal Datelast Weightlast
# <int> <chr> <int>
#1 1 6/6/20 101
#2 2 2/4/20 80
#3 3 <NA> NA
#4 4 1/3/20 85
#5 5 6/6/20 100
data
df1 <- structure(list(Animal = 1:5, Date.1 = c("12/18/19", "12/18/19",
"12/18/19", NA, "12/18/19"), Weight.1 = c(55L, 64L, 75L, NA,
88L), Date.2 = c("1/2/20", "1/3/20", NA, "1/3/20", "1/6/20"),
Weight.2 = c(67L, 69L, NA, 85L, 86L), Date.3 = c(NA, "2/4/20",
NA, NA, "2/7/20"), Weight.3 = c(NA, 80L, NA, NA, 96L),
Date.4 = c("6/6/20",
NA, NA, NA, "6/6/20"), Weight.4 = c(101L, NA, NA, NA, 100L
)), class = "data.frame", row.names = c(NA, -5L))
In base R, you could use the aggregate + reshape functions (the data is named df1 here, as in the dput above):
df_long <- reshape(`is.na<-`(df1, df1 == ""), 2:ncol(df1), idvar = "Animal", dir = "long")
aggregate(cbind(Date, Weight) ~ Animal, df_long,
          function(x) if (is.na(x[2])) NA else as.character(x[max(which(!is.na(x)))]),
          na.action = identity)
Animal Date Weight
1 1 6/6/20 101
2 2 2/4/20 80
3 3 <NA> <NA>
4 4 1/3/20 85
5 5 6/6/20 100
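For completeness, a hedged data.table sketch of the same logic (melt to long, keep the last measurement, and blank it out when the only record is the first visit); it assumes the df1 shown under data above and is not taken from either answer:
library(data.table)

long <- melt(as.data.table(df1), id.vars = "Animal",
             measure.vars = list(grep("^Date", names(df1)), grep("^Weight", names(df1))),
             value.name = c("Date", "Weight"))
long <- long[!is.na(Weight)]   # drop visits with no measurement
long[, .(
  Date.last   = if (as.integer(variable[.N]) > 1L) Date[.N]   else NA_character_,
  Weight.last = if (as.integer(variable[.N]) > 1L) Weight[.N] else NA_integer_
), keyby = Animal]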
I have a df which looks like this
ID X003-APP X005-APP X008-APP X003-COP X004-COP X008-PIN X009-PIN
363 NA NA 1 0 NA 4 5
364 0 2 NA 1 5 1 5
678 0 NA NA 5 NA NA NA
713 1 1 1 1 1 1 1
219 1 2 3 3 NA 4 5
234 NA NA NA 2 3 NA NA
321 2 3 1 NA NA 1 2
For each ID, I am interested in an indicator of whether there is at least one non-null value for each of the column substrings APP, COP and PIN. My required output is:
ID APP COP PIN
363 1 1 1
364 1 1 1
678 1 1 0
713 1 1 1
219 1 1 1
234 0 1 0
321 1 0 1
For reference, I am sharing the dput():
structure(list(ID = c(363L, 364L, 678L, 713L, 219L, 234L, 321L),
X003.APP = c(NA, 0L, 0L, 1L, 1L, NA, 2L),
X005.APP = c(NA, 2L, NA, 1L, 2L, NA, 3L),
X008.APP = c(1L, NA, NA, 1L, 3L, NA, 1L),
X003.COP = c(0L, 1L, 5L, 1L, 3L, 2L, NA),
X004.COP = c(NA, 5L, NA, 1L, NA, 3L, NA),
X008.PIN = c(4L, 1L, NA, 1L, 4L, NA, 1L),
X009.PIN = c(5L, 5L, NA, 1L, 5L, NA, 2L)),
class = "data.frame", row.names = c(NA, -7L))
Edit:
Later on, I would like to analyse sequences of 2 and 3 substrings across IDs; ultimately, I am interested in how many IDs have at least one non-null value for every substring in a sequence. My required output for a sequence of length 2 would be:
Spec_1 Spec_2 Counts
APP COP 5
APP PIN 5
COP PIN 4
Or correspondingly, my required output for a sequence of length 3 would be:
Spec_1 Spec_2 Spec_3 Counts
APP COP PIN 4
Is there an easy way to achieve this? It would be great to have a solution that could cater for longer sequences - even beyond 3. Thank you very much for your time.
You may try
library(reshape2)
library(tidyverse)
df %>%
reshape2::melt(id = "ID") %>%
separate(variable, into = c("a", "Spec"), sep = "\\.") %>%
group_by(ID, Spec) %>%
summarize(value = as.numeric(any(!is.na(value)))) %>%
filter(value == 1) %>%
pivot_wider(names_from = "Spec", values_from = "value") %>%
replace(is.na(.), 0)
ID APP COP PIN
<int> <dbl> <dbl> <dbl>
1 219 1 1 1
2 234 0 1 0
3 321 1 0 1
4 363 1 1 1
5 364 1 1 1
6 678 1 1 0
7 713 1 1 1
The output above is the per-ID indicator table, and
df %>%
reshape2::melt(id = "ID") %>%
separate(variable, into = c("a", "Spec"), sep = "\\.") %>%
group_by(ID, Spec) %>%
summarize(value = any(!is.na(value))) %>%
filter(value) %>%
group_by(ID) %>%
filter(n() > 1) %>%
summarise(Spec = combn(Spec, 2, simplify = F)) %>%
unnest_wider(Spec, names_sep = "_") %>%
group_by(Spec_1, Spec_2) %>%
summarize(Counts = n())
Spec_1 Spec_2 Counts
<chr> <chr> <int>
1 APP COP 5
2 APP PIN 5
3 COP PIN 4
gives the pairwise counts from your edit.
For sequences of length 3:
df %>%
reshape2::melt(id = "ID") %>%
separate(variable, into = c("a", "Spec"), sep = "\\.") %>%
group_by(ID, Spec) %>%
summarize(value = any(!is.na(value))) %>%
filter(value) %>%
group_by(ID) %>%
filter(n() > 2) %>%
summarise(Spec = combn(Spec, 3, simplify = F)) %>%
unnest_wider(Spec, names_sep = "_") %>%
group_by(Spec_1, Spec_2, Spec_3) %>%
summarize(Counts = n())
Spec_1 Spec_2 Spec_3 Counts
<chr> <chr> <chr> <int>
1 APP COP PIN 4
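Since you asked about longer sequences, the same pipeline can be wrapped in a function parameterised by the sequence length k. This is a minimal sketch (count_sequences is a hypothetical name, not from any package; with dplyr >= 1.1 you may prefer reframe() over the multi-row summarise()):
library(reshape2)
library(dplyr)
library(tidyr)

# counts of IDs that have at least one non-missing value
# for every Spec in a combination of length k
count_sequences <- function(df, k) {
  df %>%
    reshape2::melt(id = "ID") %>%
    separate(variable, into = c("a", "Spec"), sep = "\\.") %>%
    group_by(ID, Spec) %>%
    summarize(value = any(!is.na(value)), .groups = "drop") %>%
    filter(value) %>%
    group_by(ID) %>%
    filter(n() >= k) %>%
    summarise(Spec = combn(Spec, k, simplify = FALSE)) %>%
    unnest_wider(Spec, names_sep = "_") %>%
    group_by(across(starts_with("Spec_"))) %>%
    summarize(Counts = n(), .groups = "drop")
}

count_sequences(df, 2)   # pairwise counts
count_sequences(df, 3)   # triples; larger k works the same way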
Try this using dplyr
library(dplyr)
df |> rowwise() |> transmute( ID,
APP = case_when(all(is.na(c_across(contains("APP")))) ~ 0 , TRUE ~ 1) ,
COP = case_when(all(is.na(c_across(contains("COP")))) ~ 0 , TRUE ~ 1) ,
PIN = case_when(all(is.na(c_across(contains("PIN")))) ~ 0 , TRUE ~ 1)) -> df1
output
# A tibble: 7 × 4
# Rowwise:
ID APP COP PIN
<int> <dbl> <dbl> <dbl>
1 363 1 1 1
2 364 1 1 1
3 678 1 1 0
4 713 1 1 1
5 219 1 1 1
6 234 0 1 0
7 321 1 0 1
For your second requirement you can use:
df1 |> transmute(AC = case_when(sum(c_across(c(APP,COP))) == 2 ~ 1 , TRUE ~ 0) ,
AP = case_when(sum(c_across(c(APP,PIN))) == 2 ~ 1 , TRUE ~ 0) ,
CP = case_when(sum(c_across(c(PIN,COP))) == 2 ~ 1 , TRUE ~ 0) ,
ACP = case_when(sum(c_across(c(APP,COP,PIN))) == 3 ~ 1 , TRUE ~ 0)) |> ungroup() |>
summarise(APP_COP = sum(AC) , APP_PIN = sum(AP) , COP_PIN = sum(CP) , APP_COP_PIN = sum(ACP))
output
# A tibble: 1 × 4
APP_COP APP_PIN COP_PIN APP_COP_PIN
<dbl> <dbl> <dbl> <dbl>
1 5 5 4 4
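To avoid hardcoding each combination, a minimal sketch building on the df1 indicator table from above (specs is an assumed helper vector, not part of the original answer):
library(dplyr)
library(purrr)

specs <- c("APP", "COP", "PIN")

# count IDs where both members of each pair are present (indicator == 1)
combn(specs, 2, simplify = FALSE) %>%
  map_dfr(~ tibble(Spec_1 = .x[1], Spec_2 = .x[2],
                   Counts = sum(df1[[.x[1]]] == 1 & df1[[.x[2]]] == 1)))
# combn(specs, 3, ...) works the same way for triples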
Here is a sample of the data set:
df<-read.table (text="Id Name Surname Group A1 A2 A3 Type1 Gen1 B1 B2 B3 Type2 Gen2
116 Matt Dabi A 4 4 4 AB N 3 3 4 G N
116 Matt Dabi D 4 3 2 D N 4 3 2 G N
116 Matt Dabi Q NA NA NA NA NA NA NA NA NA NA
116 Matt Dabi B NA NA NA NA NA NA NA NA NA NA
", header=TRUE)
My desired output would be:
Id Name Surname Group A1 A2 A3 Type1 Gen1
116 Matt Dabi A 4 4 4 AB N
116 Matt Dabi D 4 3 2 D N
116 Matt Dabi Q 3 3 4 G N
116 Matt Dabi B 4 3 2 G N
I have tried df %>% na.omit()
With multiple patterns, one option is to create a spec with build_longer_spec and then use pivot_longer_spec
library(tidyr)
library(dplyr)
library(stringr)
spec <- df %>%
build_longer_spec(cols = matches("^[A-Za-z]+\\d+$"),
names_to = ".value", names_pattern = ".*(\\d+)") %>%
mutate(`.value` = case_when(str_detect(`.name`, "^[AB]\\d+$") ~
str_c('A', `.value`),
str_detect(`.name`, 'Type') ~ 'Type1',
str_detect(`.name`, 'Gen') ~ 'Gen1'))
pivot_longer_spec(df, spec, values_drop_na = TRUE)
# A tibble: 4 × 9
Id Name Surname Group A1 A2 A3 Type1 Gen1
<int> <chr> <chr> <chr> <int> <int> <int> <chr> <chr>
1 116 Matt Dabi A 4 4 4 AB N
2 116 Matt Dabi A 3 3 4 G N
3 116 Matt Dabi D 4 3 2 D N
4 116 Matt Dabi D 4 3 2 G N
Here is another approach using pivot_wider:
df %>%
pivot_longer(starts_with(c("A", "B")), names_to="ID") %>%
na.omit() %>%
mutate(ID=sub("B", "A", ID)) %>%
pivot_wider(names_from=c(ID))
In case of more columns, one can use:
df %>%
pivot_longer(matches("[A-Z][1-9]"), names_to="ID") %>%
na.omit() %>%
mutate(ID = sub("[B-Z]", "A", ID)) %>%
pivot_wider(names_from = ID)
given that the rows are uniquely identified otherwise.
# A tibble: 4 x 7
Id Name Surname Group A1 A2 A3
<int> <chr> <chr> <chr> <int> <int> <int>
1 116 Matt Dabi A 4 4 4
2 116 Matt Dabi D 4 3 2
3 116 Matt Dabi Q 4 3 3
4 116 Matt Dabi B 4 2 4
One option to achieve your desired result would be to make use of tidyr::pivot_longer like so:
library(tidyr)
library(dplyr)
df %>%
  pivot_longer(matches("^[A-Z]\\d$"), names_to = c("set", ".value"),
               names_pattern = "(.)(\\d)$", values_drop_na = TRUE) %>%
  select(-set) %>%
  rename_with(.fn = ~ paste0("A", .x), .cols = matches("^\\d"))
#> # A tibble: 6 × 7
#> Id Name Surname Group A1 A2 A3
#> <int> <chr> <chr> <chr> <int> <int> <int>
#> 1 116 Matt Dabi A 4 4 4
#> 2 116 Matt Dabi D 4 3 2
#> 3 116 Matt Dabi Q 4 3 3
#> 4 116 Matt Dabi B 4 2 4
#> 5 116 Matt Dabi Q 4 3 3
#> 6 116 Matt Dabi B 4 2 4
DATA
df <- structure(list(Id = c(116L, 116L, 116L, 116L, 116L, 116L), Name = c(
"Matt",
"Matt", "Matt", "Matt", "Matt", "Matt"
), Surname = c(
"Dabi",
"Dabi", "Dabi", "Dabi", "Dabi", "Dabi"
), Group = c(
"A", "D",
"Q", "B", "Q", "B"
), A1 = c(4L, 4L, NA, NA, NA, NA), A2 = c(
4L,
3L, NA, NA, NA, NA
), A3 = c(4L, 2L, NA, NA, NA, NA), B1 = c(
NA,
NA, 4L, 4L, NA, NA
), B2 = c(NA, NA, 3L, 2L, NA, NA), B3 = c(
NA,
NA, 3L, 4L, NA, NA
), C1 = c(NA, NA, NA, NA, 4L, 4L), C2 = c(
NA,
NA, NA, NA, 3L, 2L
), C3 = c(NA, NA, NA, NA, 3L, 4L)), class = "data.frame", row.names = c(
NA,
-6L
))
If my data looks something like this:
species1 species2 info1 info2
Loro Parrot 3 NA
NA Parrot NA 7
Osprey NA NA 89
Sparrow Finch NA 19
Sparrow NA 27 NA
Mallard Duck 69 16
Mallard NA NA NA
NA Swift 25 NA
And I want to merge it together like this:
species1 species2 info1 info2
Loro Parrot 3 7
Osprey NA NA 89
Sparrow Finch 27 19
Mallard Duck 69 16
NA Swift 25 NA
How could I do it, taking into account that I need to keep the NA records?
Thank you very much! :)
We could use a similar approach to the previous post, but in a different way: first create a named vector from the 'species' columns, use that to replace the values in the first 'species1' column, coalesce with the second one to do a grouping, and then do the summarise.
library(dplyr)
library(tibble)
nm1 <- df1 %>%
select(species1, species2) %>%
na.omit %>%
deframe
df1 %>%
group_by(species = coalesce(nm1[species1], species2)) %>%
summarise(across(everything(), ~ .[complete.cases(.)][1])) %>%
select(-species)
# A tibble: 5 x 4
species1 species2 info1 info2
<chr> <chr> <int> <int>
1 Mallard Duck 69 16
2 Sparrow Finch 27 19
3 Loro Parrot 3 7
4 <NA> Swift 25 NA
5 Osprey <NA> NA 89
data
df1 <- structure(list(species1 = c("Loro", NA, "Osprey", "Sparrow",
"Sparrow", "Mallard", "Mallard", NA), species2 = c("Parrot",
"Parrot", NA, "Finch", NA, "Duck", NA, "Swift"), info1 = c(3L,
NA, NA, NA, 27L, 69L, NA, 25L), info2 = c(NA, 7L, 89L, 19L, NA,
16L, NA, NA)), class = "data.frame", row.names = c(NA, -8L))
You may group by one column and fill the NA values in the other one to get the pairs, after which take the sum of the values grouping by both species columns.
library(dplyr)
library(tidyr)
df %>%
group_by(species2) %>%
fill(species1) %>%
group_by(species1) %>%
fill(species2) %>%
group_by(species2, .add = TRUE) %>%
summarise(across(everything(), ~ sum(.x, na.rm = TRUE))) %>%
ungroup
# species1 species2 info1 info2
# <chr> <chr> <int> <int>
#1 Loro Parrot 3 7
#2 Mallard Duck 69 16
#3 Osprey NA 0 89
#4 Sparrow Finch 27 19
#5 NA Swift 25 0
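If you need NA rather than 0 when a species has no values at all for a column (as in your desired output), a hedged variant of the same pipeline; it assumes df holds the data from the question (named df1 in the dput above), and NA_integer_ matches the integer info columns:
library(dplyr)
library(tidyr)

df %>%
  group_by(species2) %>%
  fill(species1) %>%
  group_by(species1) %>%
  fill(species2) %>%
  group_by(species2, .add = TRUE) %>%
  summarise(across(everything(),
                   ~ if (all(is.na(.x))) NA_integer_ else sum(.x, na.rm = TRUE)),
            .groups = "drop")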
Using tidyr >= 1.0.0, one can use tidy selection in the cols argument as follows:
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols=starts_with("DL_TM"),
names_to = "TM",values_to = "DM_TM") %>%
pivot_longer(cols=starts_with("DL_CD"),
names_to = "CD",values_to = "DL_CD") %>%
na.omit() %>%
select(-TM,-CD)
However, the above will quickly get cumbersome (repetitive) with many columns. How can one reduce this to a single pivot? I have imagined something conceptual like
pivot_longer(cols = starts_with("DL_TM | DL_CD"), ...), which will obviously not work, because tidy selection only works for a single pattern (as far as I know).
Data
df <- structure(list(DL_TM1 = c(16L, 18L, 53L, 59L, 29L, 3L), DL_CD1 = c("AK",
"RB", "RA", "AJ", "RA", "RS"), DL_TM2 = c(5L, 4L, 8L, NA, 1L,
NA), DL_CD2 = c("CN", "AJ", "RB", NA, "AJ", NA), DL_TM3 = c(NA,
NA, 2L, NA, NA, NA), DL_CD3 = c(NA, NA, "AJ", NA, NA, NA), DL_TM4 = c(NA,
NA, NA, NA, NA, NA), DL_CD4 = c(NA, NA, NA, NA, NA, NA), DL_TM5 = c(NA,
NA, NA, NA, NA, NA), DL_CD5 = c(NA, NA, NA, NA, NA, NA), DEP_DELAY_TM = c(21L,
22L, 63L, 59L, 30L, 3L)), class = "data.frame", row.names = c(NA,
-6L))
Expected output:
The same as above, but obtained with a single pivot.
Based on the response in the comments, the code in the question does not actually produce the desired result; what was wanted is the result that the following produces:
df %>%
pivot_longer(-DEP_DELAY_TM, names_to = c(".value", "X"),
names_pattern = "(\\D+)(\\d)") %>%
select(-X) %>%
drop_na
giving:
# A tibble: 11 x 3
DEP_DELAY_TM DL_TM DL_CD
<int> <int> <chr>
1 21 16 AK
2 21 5 CN
3 22 18 RB
4 22 4 AJ
5 63 53 RA
6 63 8 RB
7 63 2 AJ
8 59 59 AJ
9 30 29 RA
10 30 1 AJ
11 3 3 RS
Base R
Alternatively, we can do this using base R's reshape. First split the column names (except the last column) by their non-digit parts to build the varying list, then reshape df to long form using that, and finally run na.omit to remove the rows with NAs.
nms1 <- head(names(df), -1)
varying <- split(nms1, gsub("\\d", "", nms1))
na.omit(reshape(df, dir = "long", varying = varying, v.names = names(varying)))
giving:
DEP_DELAY_TM time DL_CD DL_TM id
1.1 21 1 AK 16 1
2.1 22 1 RB 18 2
3.1 63 1 RA 53 3
4.1 59 1 AJ 59 4
5.1 30 1 RA 29 5
6.1 3 1 RS 3 6
1.2 21 2 CN 5 1
2.2 22 2 AJ 4 2
3.2 63 2 RB 8 3
5.2 30 2 AJ 1 5
3.3 63 3 AJ 2 3
We can extract the column groupings ("TM" and "CD" in this case), map over each column group to apply pivot_longer to that group, and then full_join the resulting list elements. Let me know if this covers your real-world use case.
library(dplyr)
library(tidyr)
library(purrr)

suffixes = unique(gsub(".*_(.{2})[0-9]*", "\\1", names(df)))
df.long = suffixes %>%
map(~ df %>%
mutate(id=1:n()) %>% # Ensure unique identification of each original data row
select(id, DEP_DELAY_TM, starts_with(paste0("DL_",.x))) %>%
pivot_longer(cols=-c(DEP_DELAY_TM, id),
names_to=.x,
values_to=paste0(.x,"_value")) %>%
na.omit() %>%
select(-matches(paste0("^",.x,"$")))
) %>%
reduce(full_join) %>%
select(-id)
DEP_DELAY_TM TM_value CD_value
1 21 16 AK
2 21 16 CN
3 21 5 AK
4 21 5 CN
5 22 18 RB
6 22 18 AJ
7 22 4 RB
8 22 4 AJ
9 63 53 RA
10 63 53 RB
11 63 53 AJ
12 63 8 RA
13 63 8 RB
14 63 8 AJ
15 63 2 RA
16 63 2 RB
17 63 2 AJ
18 59 59 AJ
19 30 29 RA
20 30 29 AJ
21 30 1 RA
22 30 1 AJ
23 3 3 RS
I have a relatively large (~100,000 rows) dataset, with multiple rows for each individual. Individuals are identified by 'id'. My goal is to convert to a data frame or data.table with a single row per individual.
For each column, i.e. wt:sat, each row would contain an indicator signifying whether there was at least one non-missing instance of that variable for a given individual.
For example, given the data below:
dat <- structure(list(id = c(386L, 386L, 2794L, 2794L, 2794L, 2794L,
2732L, 2732L), wt = c(56.7, 56.7, NA, NA, NA, NA, 36.3, 36.3),
pain = c(NA, NA, 8L, 8L, NA, NA, NA, NA), sbp = c(120L, NA,
125L, 125L, NA, NA, 120L, 120L), dbp = c(60L, NA, 81L, 81L,
NA, NA, 67L, 67L), hr = c(84L, NA, 100L, 100L, NA, NA, 120L,
120L), rr = c(16L, NA, 18L, 18L, NA, NA, 24L, 24L), sat = c(93L,
NA, NA, NA, NA, NA, 99L, 99L)), row.names = c(NA, -8L), class = "data.frame")
I would like to produce:
  id wt pain sbp dbp hr rr sat
 386  1    0   1   1  1  1   1
2794  0    1   1   1  1  1   0
2732  1    0   1   1  1  1   1
I think this is what you are looking for:
A tidyverse solution:
dat %>%
replace(is.na(.), 0) %>%
group_by(id) %>%
summarise_all(~as.numeric(any(. > 0)))
# A tibble: 3 x 8
id wt pain sbp dbp hr rr sat
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 386 1 0 1 1 1 1 1
2 2732 1 0 1 1 1 1 1
3 2794 0 1 1 1 1 1 0
A data.table solution:
dat2 <- dat
setDT(dat2)
dat2[is.na(dat2)] <- 0
dat2[, lapply(.SD, function(x) as.numeric(any(x > 0))), id]
Or, a more concise data.table solution from @markus (thank you). You can also use !is.na(.) in lieu of . > 0 in the other solutions (in which case you do not have to replace NA with 0):
cols <- names(dat)[-1]
setDT(dat)[, lapply(.SD, function(x) as.integer(any(!is.na(x)))), .SDcols = cols, by = id]
id wt pain sbp dbp hr rr sat
1: 386 1 0 1 1 1 1 1
2: 2794 0 1 1 1 1 1 0
3: 2732 1 0 1 1 1 1 1
An option in base R
aggregate(.~ id, replace(dat, is.na(dat), 0), FUN =
function(x) as.integer(any(x > 0)), na.action = NULL)
# id wt pain sbp dbp hr rr sat
#1 386 1 0 1 1 1 1 1
#2 2732 1 0 1 1 1 1 1
#3 2794 0 1 1 1 1 1 0
Or with rowsum from base R
+(rowsum(+(dat[-1] > 0 & !is.na(dat[-1])), dat$id) != 0)
# wt pain sbp dbp hr rr sat
#386 1 0 1 1 1 1 1
#2732 1 0 1 1 1 1 1
#2794 0 1 1 1 1 1 0
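For readers unpacking the rowsum one-liner, the same computation step by step (a sketch, not from the original answer):
present <- dat[-1] > 0 & !is.na(dat[-1])   # TRUE where a (positive) value is recorded
counts  <- rowsum(+present, dat$id)        # per-id counts of recorded values per column
+(counts != 0)                             # 0/1 indicator: at least one recorded value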
Try:
library(tidyr)
library(dplyr)
dat %>%
  gather(key, value, -id) %>%
  mutate(value2 = if_else(is.na(value), 0, 1)) %>%  # 1 where a value is present
  group_by(id, key) %>%
  summarise(value2 = max(value2)) %>%
  spread(key, value2)
In short:
convert to long format
create indicator variable
summarise over id and key
reconvert to wide format (an equivalent with the newer pivot verbs is sketched below)
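gather and spread are superseded; for reference, a minimal sketch of the same four steps with the newer pivot verbs, assuming dat from the question:
library(dplyr)
library(tidyr)

dat %>%
  pivot_longer(-id, names_to = "key", values_to = "value") %>%  # 1. long format
  mutate(flag = if_else(is.na(value), 0, 1)) %>%                # 2. indicator variable
  group_by(id, key) %>%
  summarise(flag = max(flag), .groups = "drop") %>%             # 3. summarise over id and key
  pivot_wider(names_from = key, values_from = flag)             # 4. back to wide format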