I have a dataframe that looks something like this, in which I have several rows for each user, and many NAs in the columns.
user
Effect T1
Effect T2
Effect T3
Benchmark T1
Benchmark T2
Benchmark T3
Tom
01
NA
NA
02
NA
NA
Tom
NA
07
NA
NA
08
NA
Tom
NA
NA
13
NA
NA
14
Larry
03
NA
NA
04
NA
NA
Larry
NA
09
NA
NA
10
NA
Larry
NA
NA
15
NA
NA
16
Dave
05
NA
NA
06
NA
NA
Dave
NA
11
NA
NA
12
NA
Dave
NA
NA
17
NA
NA
18
I want to collapse the columns using the name and filling the values from reach row, this this.
user
Effect T1
Effect T2
Effect T3
Benchmark T1
Benchmark T2
Benchmark T3
Tom
01
07
13
02
08
14
Larry
03
09
15
04
10
16
Dave
05
11
17
06
12
18
How might I accomplish this?
Thank you in advance for your help. Update: I've added the dput of a subset of the actual data below.
structure(list(name = c("Abraham_Ralph", "Abraham_Ralph", "Abraham_Ralph",
"Ackerman_Gary", "Adams_Alma", "Adams_Alma", "Adams_Alma", "Adams_Alma",
"Adams_Sandy", "Aderholt_Robert", "Aderholt_Robert", "Aderholt_Robert",
"Aderholt_Robert", "Aderholt_Robert", "Aguilar_Pete", "Aguilar_Pete",
"Aguilar_Pete"), state = c("LA", "LA", "LA", "NY", "NC", "NC",
"NC", "NC", "FL", "AL", "AL", "AL", "AL", "AL", "CA", "CA", "CA"
), seniority = c(1, 2, 3, 15, 1, 2, 3, 4, 1, 8, 9, 10, 11, 12,
1, 2, 3), legeffect_112 = c(NA, NA, NA, 0.202061712741852, NA,
NA, NA, NA, 1.30758035182953, 3.73544979095459, NA, NA, NA, NA,
NA, NA, NA), legeffect_113 = c(NA, NA, NA, NA, 0, NA, NA, NA,
NA, NA, 0.908495426177979, NA, NA, NA, NA, NA, NA), legeffect_114 = c(2.07501077651978,
NA, NA, NA, NA, 0.84164834022522, NA, NA, NA, NA, NA, 0.340001106262207,
NA, NA, 0.10985741019249, NA, NA), legeffect_115 = c(NA, 0.493490308523178,
NA, NA, NA, NA, 0.587624311447144, NA, NA, NA, NA, NA, 0.159877583384514,
NA, NA, 0.730929613113403, NA), legeffect_116 = c(NA, NA, 0.0397605448961258,
NA, NA, NA, NA, 1.78378939628601, NA, NA, NA, NA, NA, 0.0198802724480629,
NA, NA, 0.0497006773948669), benchmark_112 = c(NA, NA, NA, 0.738679468631744,
NA, NA, NA, NA, 0.82908970117569, 1.39835929870605, NA, NA, NA,
NA, NA, NA, NA), benchmark_113 = c(NA, NA, NA, NA, 0.391001850366592,
NA, NA, NA, NA, NA, 1.58223271369934, NA, NA, NA, NA, NA, NA),
benchmark_114 = c(1.40446054935455, NA, NA, NA, NA, 0.576326191425323,
NA, NA, NA, NA, NA, 1.42212760448456, NA, NA, 0.574363172054291,
NA, NA), benchmark_115 = c(NA, 1.3291300535202, NA, NA, NA,
NA, 0.537361204624176, NA, NA, NA, NA, NA, 1.45703768730164,
NA, NA, 0.523149251937866, NA), benchmark_116 = c(NA, NA,
0.483340591192245, NA, NA, NA, NA, 1.31058621406555, NA,
NA, NA, NA, NA, 0.751261711120605, NA, NA, 1.05683290958405
)), row.names = c(NA, -17L), class = c("tbl_df", "tbl", "data.frame"
))
A data.table solution:
# melt data, remove NA, then recast ...
dt <- dcast(melt(data.table(d), "name")[!value %in% NA], name ~ variable)
dcast(melt(data.table(d), "name")[!value %in% c(NA) & !variable %in% c("variable", "seniority", "state")], name ~ variable)
name legeffect_112 legeffect_113 legeffect_114 legeffect_115 legeffect_116 benchmark_112 benchmark_113 benchmark_114 benchmark_115 benchmark_116
1: Abraham_Ralph <NA> <NA> 2.07501077651978 0.493490308523178 0.0397605448961258 <NA> <NA> 1.40446054935455 1.3291300535202 0.483340591192245
2: Ackerman_Gary 0.202061712741852 <NA> <NA> <NA> <NA> 0.738679468631744 <NA> <NA> <NA> <NA>
3: Adams_Alma <NA> 0 0.84164834022522 0.587624311447144 1.78378939628601 <NA> 0.391001850366592 0.576326191425323 0.537361204624176 1.31058621406555
4: Adams_Sandy 1.30758035182953 <NA> <NA> <NA> <NA> 0.82908970117569 <NA> <NA> <NA> <NA>
5: Aderholt_Robert 3.73544979095459 0.908495426177979 0.340001106262207 0.159877583384514 0.0198802724480629 1.39835929870605 1.58223271369934 1.42212760448456 1.45703768730164 0.751261711120605
6: Aguilar_Pete <NA> <NA> 0.10985741019249 0.730929613113403 0.0497006773948669 <NA> <NA> 0.574363172054291 0.523149251937866 1.05683290958405
Data/Setup
# Load data.table
# install.packages("data.table")
library(data.table)
# Read example data
d <- structure(list(name = c("Abraham_Ralph", "Abraham_Ralph", "Abraham_Ralph",
"Ackerman_Gary", "Adams_Alma", "Adams_Alma", "Adams_Alma", "Adams_Alma",
"Adams_Sandy", "Aderholt_Robert", "Aderholt_Robert", "Aderholt_Robert",
"Aderholt_Robert", "Aderholt_Robert", "Aguilar_Pete", "Aguilar_Pete",
"Aguilar_Pete"), state = c("LA", "LA", "LA", "NY", "NC", "NC",
"NC", "NC", "FL", "AL", "AL", "AL", "AL", "AL", "CA", "CA", "CA"
), seniority = c(1, 2, 3, 15, 1, 2, 3, 4, 1, 8, 9, 10, 11, 12,
1, 2, 3), legeffect_112 = c(NA, NA, NA, 0.202061712741852, NA,
NA, NA, NA, 1.30758035182953, 3.73544979095459, NA, NA, NA, NA,
NA, NA, NA), legeffect_113 = c(NA, NA, NA, NA, 0, NA, NA, NA,
NA, NA, 0.908495426177979, NA, NA, NA, NA, NA, NA), legeffect_114 = c(2.07501077651978,
NA, NA, NA, NA, 0.84164834022522, NA, NA, NA, NA, NA, 0.340001106262207,
NA, NA, 0.10985741019249, NA, NA), legeffect_115 = c(NA, 0.493490308523178,
NA, NA, NA, NA, 0.587624311447144, NA, NA, NA, NA, NA, 0.159877583384514,
NA, NA, 0.730929613113403, NA), legeffect_116 = c(NA, NA, 0.0397605448961258,
NA, NA, NA, NA, 1.78378939628601, NA, NA, NA, NA, NA, 0.0198802724480629,
NA, NA, 0.0497006773948669), benchmark_112 = c(NA, NA, NA, 0.738679468631744,
NA, NA, NA, NA, 0.82908970117569, 1.39835929870605, NA, NA, NA,
NA, NA, NA, NA), benchmark_113 = c(NA, NA, NA, NA, 0.391001850366592,
NA, NA, NA, NA, NA, 1.58223271369934, NA, NA, NA, NA, NA, NA),
benchmark_114 = c(1.40446054935455, NA, NA, NA, NA, 0.576326191425323,
NA, NA, NA, NA, NA, 1.42212760448456, NA, NA, 0.574363172054291,
NA, NA), benchmark_115 = c(NA, 1.3291300535202, NA, NA, NA,
NA, 0.537361204624176, NA, NA, NA, NA, NA, 1.45703768730164,
NA, NA, 0.523149251937866, NA), benchmark_116 = c(NA, NA,
0.483340591192245, NA, NA, NA, NA, 1.31058621406555, NA,
NA, NA, NA, NA, 0.751261711120605, NA, NA, 1.05683290958405
)), row.names = c(NA, -17L), class = c("tbl_df", "tbl", "data.frame"
))
This solution is using only the base functions (no extra packages), but the one-liner may cause eyes to cross, so I'll split it into several functions.
The plan is the following:
Split the original data.frame by the values in name column, using the function by;
For each partition of the data.frame, collapse the columns;
A collapsed column returns the max value of the column, or NA if all its values are NA;
The collapsed data.frame partitions are stacked together.
So, this is a function that does that:
dfr_collapse <- function(dfr, col0)
{
# Collapse the columns of the data.frame "dfr" grouped by the values of
# the column "col0"
# Max/NA function
namax <- function(x)
{
if(all(is.na(x)))
NA # !!!
else
max(x, na.rm=TRUE)
}
# Column collapse function
byfun <- function(x)
{
lapply(x, namax)
}
# Stack the partitioning results
return(do.call(
what = rbind,
args = by(dfr, dfr[[col0]], byfun)
))
}
May not look as slick as a one-liner, but it does the job. It can be tunrned into a one-liner, but you don't want that.
Assuming that df0 is the data.frame from you dput, you can test this function with
dfr_collapse(df0)
Nota bene: for the sake of simplicity, I return an NA of type logical (see the comment # !!! above). The correct code should convert that NA to the mode of the x vector. Also, the function should check the type of its inputs, etc.
Related
Df <- data.frame(prop1 = c(NA, NA, NA, "French", NA, NA,NA, "-29 to -20", NA, NA, NA, "Pop", NA, NA, NA, "French", "-29 to -20", "Pop"),
prop1_rank = c(NA, NA, NA, 0, NA, NA,NA, 11, NA, NA, NA, 1, NA, NA, NA, 40, 0, 2),
prop2 = c(NA, NA, NA, "Spanish", NA, NA,NA, "-19 to -10", NA, NA, NA, "Rock", NA, NA, NA, "Spanish", "-19 to -10", "Rock"),
prop2_rank = c(NA, NA, NA, 10, NA, NA,NA, 4, NA, NA, NA, 1, NA, NA, NA, 1, 0, 2),
initOSF1 = c(NA, NA, NA, NA, NA, "French", NA,NA,NA, "-29 to -20", NA, NA, NA, "Pop", NA, NA, NA, NA),
initOSF1_freq = c(NA, NA, NA, NA, NA, 66, NA,NA,NA, 0, NA, NA, NA, 14, NA, NA, NA, NA),
initOSF2 = c(NA, NA, NA, NA, NA, "Spanish", NA,NA,NA, "-19 to -10", NA, NA, NA, "Rock", NA, NA, NA, NA),
initOSF2_freq = c(NA, NA, NA, NA, NA, 0, NA,NA,NA, 6, NA, NA, NA, 14, NA, NA, NA, NA))
Df
I would like to organize this into
3 columns consisting: c("propositions", "ranks", "freqs"),
where,
Propositions column has the values: "French", "Spanish", "-29 to -20", "19 to -10", "Pop", "Rock", and having a separate columns for the rank values e.g., 0 for French, 10 for Spanish, etc., and frequency values e.g., 66 for French, 0 for Spanish, etc.
This is not an easy one. Probably a better solution exists:
library(tidyverse)
library(data.table)
setDT(Df) %>%
select(contains(c('prop', 'rank', 'freq'))) %>%
filter(!if_all(everything(), is.na)) %>%
melt(measure.vars = patterns(c('prop.$', 'rank$', 'freq'))) %>%
group_by(gr=cumsum(!is.na(value1)))%>%
summarise(across(-variable, ~if(length(.x)>1) na.omit(.x) else .x))
# A tibble: 12 x 4
gr value1 value2 value3
<int> <chr> <dbl> <dbl>
1 1 French 0 66
2 2 -29 to -20 11 0
3 3 Pop 1 14
4 4 French 40 NA
5 5 -29 to -20 0 NA
6 6 Pop 2 NA
7 7 Spanish 10 0
8 8 -19 to -10 4 6
9 9 Rock 1 14
10 10 Spanish 1 NA
11 11 -19 to -10 0 NA
12 12 Rock 2 NA
I have data.frame that looks like this:
I want to quickly reshape it so I will only one record for each ID, something that is looks like this:
df can be build using codes:
df<-structure(list(ID = structure(c("05-102", "05-102", "05-102",
"01-103", "01-103", "01-103", "08-104", "08-104", "08-104", "05-105",
"05-105", "05-105", "02-106", "02-106", "02-106", "05-107", "05-107",
"05-107", "08-108", "08-108", "08-108", "02-109", "02-109", "02-109",
"05-111", "05-111", "05-111", "07-115", "07-115", "07-115"), label = "Unique Subject Identifier", format.sas = "$"),
EXSTDTC1 = structure(c(NA, NA, NA, 17022, NA, NA, 17024,
NA, NA, 17032, NA, NA, 17038, NA, NA, 17092, NA, NA, 17108,
NA, NA, 17155, NA, NA, 17247, NA, NA, 17333, NA, NA), class = "Date"),
EXSTDTC6 = structure(c(NA, 16885, NA, NA, NA, 17031, NA,
NA, 17032, NA, NA, 17041, NA, NA, 17047, NA, NA, 17100, NA,
NA, 17116, NA, 17164, NA, NA, NA, 17256, NA, 17342, NA), class = "Date"),
EXSTDTC3 = structure(c(NA, NA, 16881, NA, 17027, NA, NA,
17029, NA, NA, 17037, NA, NA, 17043, NA, NA, 17097, NA, NA,
17113, NA, NA, NA, 17160, NA, 17252, NA, NA, NA, 17338), class = "Date"),
EXDOSEA1 = c("73.8+147.6", NA, NA, "64.5+129", NA, NA, "62.7+125.4",
NA, NA, "114+57", NA, NA, "60+117.5", NA, NA, "48.6+97.2",
NA, NA, "61.2+122.4", NA, NA, "47.7+95.4", NA, NA, "51.6+103.2",
NA, NA, "68+136", NA, NA), EXDOSEA6 = c(NA, "100", NA, NA,
NA, "86", NA, NA, "83.5", NA, NA, "76", NA, NA, "39.2", NA,
NA, "32", NA, NA, "81.5", NA, "69.6", NA, NA, NA, "68", NA,
"91", NA), EXDOSEA3 = c(NA, NA, "1600", NA, "4302", NA, NA,
"4185", NA, NA, "3900", NA, NA, "3921", NA, NA, "3300", NA,
NA, "4080", NA, NA, NA, "3183", NA, "3300", NA, NA, NA, "1514"
)), row.names = c(NA, -30L), class = c("tbl_df", "tbl", "data.frame"
))
right now I have my codes as:
df %>%
group_by(ID) %>%
summarise(across(EXSTDTC1:EXDOSEA3, na.omit))
But it seems remove the 05-102 as it did not have value on EXSTDTC1. I would like to see how we can address this. Is it possible to keep across still?
Many thanks.
We could use an if/else condition to address those cases where there is only NA
library(dplyr)
df %>%
group_by(ID) %>%
summarise(across(EXSTDTC1:EXDOSEA3,
~ if(all(is.na(.))) NA else .[complete.cases(.)]), .groups = 'drop')
-output
# A tibble: 10 x 7
# ID EXSTDTC1 EXSTDTC6 EXSTDTC3 EXDOSEA1 EXDOSEA6 EXDOSEA3
# <chr> <date> <date> <date> <chr> <chr> <chr>
# 1 01-103 2016-08-09 2016-08-18 2016-08-14 64.5+129 86 4302
# 2 02-106 2016-08-25 2016-09-03 2016-08-30 60+117.5 39.2 3921
# 3 02-109 2016-12-20 2016-12-29 2016-12-25 47.7+95.4 69.6 3183
# 4 05-102 NA 2016-03-25 2016-03-21 73.8+147.6 100 1600
# 5 05-105 2016-08-19 2016-08-28 2016-08-24 114+57 76 3900
# 6 05-107 2016-10-18 2016-10-26 2016-10-23 48.6+97.2 32 3300
# 7 05-111 2017-03-22 2017-03-31 2017-03-27 51.6+103.2 68 3300
# 8 07-115 2017-06-16 2017-06-25 2017-06-21 68+136 91 1514
# 9 08-104 2016-08-11 2016-08-19 2016-08-16 62.7+125.4 83.5 4185
#10 08-108 2016-11-03 2016-11-11 2016-11-08 61.2+122.4 81.5 4080
I am trying to parse multiple columns into each of their components. However the number of components varies across the columns. Specifically, suppose the following df:
id X1.startAll X2.startAll
1 ["1555726884484","1555727530298","1555727532509"]
2 ["1555735159384","1555735161545"]
3 ["1555730029709"]
4 ["1555735159384","1555735161545"]
5
6 ["1555735159384","1555735161545"]
now I have 40 of these columns (and another 120 very similar ones, to which I aim to generalize the process) and many more rows. I can do the first column quite simply using the following:
df1$X1.startAll1 <- str_replace(df1$X1.startAll, "\\[\"([0-9]+)\",*\"*([0-9]*)\"*,*\"*([0-9]*)\"*\\]", "\\1")
df1$X1.startAll2 <- str_replace(df1$X1.startAll, "\\[\"([0-9]+)\",*\"*([0-9]*)\"*,*\"*([0-9]*)\"*\\]", "\\2")
df1$X1.startAll3 <- str_replace(df1$X1.startAll, "\\[\"([0-9]+)\",*\"*([0-9]*)\"*,*\"*([0-9]*)\"*\\]", "\\3")
which yields my desired result:
id X1.startAll X1.startAll1 X1.startAll2 X1.startAll3
1 ["1555726884484","1555727530298","1555727532509"] 1555726884484 1555727530298 1555727532509
2
3 ["1555730029709"] 1555730029709
4 ["1555735159384","1555735161545"] 1555735159384 1555735161545
5
6
However, I have to do this for many columns and for many different 'array' lengths within each of these.
I have tried automating this using a for loop, however, I (1) can't figure out how to read the right number of iterations (i.e. the max the number of components in the startAll column), (2) dynamically create the variables, (3) nor how to update the string extraction dynamically ("\\i").
Any and all help on looping this process would help a lot!
Edit 2: below is a copy-pasteable sample of the data:
structure(list(X1.startAll = list(NA, NA, NA, NA, c(1555726884484,
1555727530298, 1555727532509), NA, NA, c(1555735159384, 1555735161545
), NA, NA, NA, 1555730029709, NA, NA, NA, c(1555728423843, 1555728561054,
1555728586917), c(1555725657389, 1555725657827), c(1555703810672,
1555703823206, 1555703848659), NA, NA), X2.startAll = list(NA,
NA, NA, NA, c(1555727541885, 1555727786959, 1555727897893
), NA, NA, 1555735262052, c(1555737694350, 1555737696711),
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), X3.startAll = list(
NA, NA, NA, NA, c(1555727920770, 1555728230065, 1555728843391
), NA, NA, c(1555735331144, 1555735452321, 1555735457305),
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), X4.startAll = list(
NA, NA, NA, NA, 1555728854666, NA, NA, 1555735589629, 1555738374484,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), X5.startAll = list(
NA, NA, NA, NA, c(1555728949327, 1555728988444), NA, NA,
c(1555735646258, 1555735912372, 1555735914267, 1555736071856,
1555736074184, 1555736093411, 1555736124826, 1555736238538,
1555736248889, 1555736576754, 1555736620915, 1555736874386,
1555737698921, 1555737777400, 1555737966562, 1555738152090,
1555738354075, 1555738700232, 1555738703134, 1555738716736
), 1555738415269, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), X6.startAll = list(NA, NA, NA, NA, 1555729661240, NA,
NA, NA, 1555738960285, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), X7.startAll = list(NA, NA, NA, NA, c(1555730266934,
1555730356654, 1555730533798, 1555730535289), NA, c(1555732523945,
1555733415340, 1555733477452, 1555733748200, 1555734007271, 1555734286685,
1555734288597), NA, c(1555739871726, 1555740315324, 1555740328252,
1555740329835, 1555740538272, 1555741140561, 1555741143555, 1555741152932
), c(1555743562826, 1555743566386, 1555743593201), NA, NA, NA,
c(1555727969354, 1555727985539, 1555728064237, 1555738166838,
1555826735910), NA, NA, NA, NA, NA, NA), X8.startAll = list(
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA)), row.names = c(NA, -20L), class = "data.frame")
I have a file which contains some Order_IDs and their Externsion_ID if exists. A new order can be fresh order, or an extension of existing Order_ID or an extension of an existing extension.
My problem is to add a new column named Parent_ID which marks the root of the Order_ID.
Please find the expected output as below :
A reproducible input is attached below.
df1 = structure(list(Order_ID = c("SL158", "SL159", "SL160", "SL162",
"SL164", "SL165", "SL168", "SL169", "SL170", "SL171", "SL172",
"SL176", "SL177", "SL178", "SL179", "SL180", "SL183", "SL184",
"SL189", "SL190", "SL191", "SL192", "SL193", "SL195", "SL196",
"SL199", "SL200", "SL201", "SL207", "SL208", "SL209", "SL218",
"SL219", "SL223", "SL224", "SL225", "SL226", "SL227", "SL229",
"SL232", "SL233", "SL234", "SL235", "SL239", "SL240", "SL241",
"SL242", "SL243", "SL251", "SL252", "SL257", "SL258", "SL260",
"SL261", "SL262", "SL266", "SL267", "SL268", "SL269", "SL277",
"SL278", "SL279", "SL280", "SL281", "SL287", "SL288", "SL289",
"SL300", "SL301", "SL302", "SL303", "SL304", "SL305", "SL315",
"SL316", "SL322", "SL323", "SL327", "SL328", "SL333", "SL334",
"SL335", "SL336", "SL337", "SL340", "SL341", "SL342", "SL343",
"SL344", "SL345", "SL350", "SL351", "SL352", "SL353", "SL354",
"SL355", "SL363", "SL364", "SL365", "SL366", "SL367", "SL368",
"SL369", "SL370", "SL376", "SL377", "SL378", "SL379", "SL380",
"SL381", "SL382", "SL383", "SL384", "SL385", "SL1217", "SL1452",
"SL4316", "SL4317", "SL4348", "SL4381", "SL4681", "SL4738", "SL5319",
"SL5520", "SL5703", "SL6132", "SL6244", "SL6855", "SL6997", "SLB1253161",
"SLB2970530", "SLB27287329", "SLB36502009", "SLB81913180", "SLB82838226",
"SLB90244936", "SLB99701642", "SL11995", "SLH5317239", "SLH22149557",
"SLH44727392", "SLH45803004", "SLH57801072", "SLH74470000", "SLH79063451",
"SL1134", "SL1011", "SL3686", "SL3691", "SL3695", "SL3716", "SL3718",
"SL3720", "SL3721", "SL3727", "SL5242", "SL5245", "SL5246", "SL5254",
"SL5255", "SL10126", "SL10134", "SL10143", "SL11333", "SL11338",
"SL11365", "SL11377", "SL11384", "SL10004", "SL10046", "SL10058",
"SL10070", "SL10092", "SL11335", "SL11364", "SL11366"),
Extension_Of = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, "SL1134", "SL1011", "SL3691", "SL3718", "SL3727", "SL3695",
"SL3720", "SL3716", "SL3721", "SL5242", "SL5246", "SL5245", "SL5254",
"SL5255", "SL3686", "SL11365", "SL11384", "SL11377", "SL10134",
"SL11333", "SL10143", "SL11338", "SL10126", "SL10046", "SL10070",
"SL11364", "SL11335", "SL10004", "SL10058", "SL11366", "SL10092",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "SL384", NA, NA, "SL171", NA,
NA, NA)),
row.names = c(NA, -176L),
class = c("tbl_df", "tbl", "data.frame"))
head(df1)
# Order_ID Extension_Of
#1 SL158 <NA>
#2 SL159 <NA>
#3 SL160 <NA>
#4 SL162 <NA>
#5 SL164 <NA>
#6 SL165 <NA>
Here is a solution based on igraph:
library(igraph) # 1.2.1
v <- data.frame(name = unique(unlist(df1)), stringsAsFactors = FALSE)
v <- v[!is.na(v$name), ]
g <- graph_from_data_frame(df1[!is.na(df1$Extension_Of), 2:1], vertices = v)
df1$Parent_ID <- sapply(df1$Order_ID, function(oid){
n <- ego(g, order = nrow(df1), oid, mode = 'in')[[1]]
nin <- lapply(n, function(x){ego(g, order = nrow(df1), x, mode = 'in')[[1]]})
root <- n[lengths(nin) == 1]$name
})
df1[df1$Parent_ID == 'SL384', ]
# Order_ID Extension_Of Parent_ID
# 113 SL384 <NA> SL384
# 138 SL11995 SL10046 SL384
# 170 SL10046 SL384 SL384
This answer is inspired by this answer and this function.
The rationale: Each line without NA in df1 can be treated as an edge in a graph. if B is extension of A, we have an edge A -> B. If C is extension of B, we get B->C. Then the problem can be rephrased as: for each node (Order_ID), find its root node. For C, its root node is A since (A->B->C).
In the code above, for Order_ID, ego finds all the nodes that are directly or indirectly upstream of it (including itself). Among those upstream nodes, we can determine the root node as the one without other upstream nodes.
I am trying to write a function that will uses values from variables stored in different columns to generate a new variable. The logic requires a series of ifelse statements. However, the final statement is always evaluating to true and I don't understand why.
Even more puzzling when I generated data for the MWE then the function works fine. But it still behaves bizarrely with a sample from the real data. I am guessing there is something in my environment that is causing mischief but I am now lost as how to investigate further.
FWIW I have tried writing this function in data.table and now dplyr syntax, and I get similar problems with both approaches.
Simpler functions that don't use ifelse statements seem to behave just fine.
gen_sofa_c <- function(data, map=NA, noradr=NA, dopa=NA, adr=NA, vasopressin=NA) {
library(dplyr)
# Extract the arguments and force conversion to string
pars <- as.list(match.call()[-1])
vasopressin <- as.character(pars$vasopressin)
noradr <- as.character(pars$noradr)
adr <- as.character(pars$adr)
dopa <- as.character(pars$dopa)
map <- as.character(pars$map)
# Default to NA
# if ("sofa_c" %in% names(data)) data$sofa_c <- NULL
# data$sofa_c <- as.numeric(NA)
return(
data %>%
# # Return 0 if MAP >= 70
mutate(sofa_c = ifelse(!is.na(map) & map >= 70, 0 , NA)) %>%
# # Return 1 if MAP < 70
mutate(sofa_c = ifelse(!is.na(map) & map < 70, 1 , sofa_c)) %>%
# # Return SOFA 2 if norad OR adr > 0.0 or dopamine > 5
mutate(sofa_c = ifelse(!is.na(noradr) & noradr > 0.0 , 2 , sofa_c)) %>%
# # Return SOFA 3 if norad OR adr > 0.1 or dopamine > 15
mutate(sofa_c = ifelse(!is.na(noradr) & noradr > 0.1 , 3 , sofa_c)) %>%
# # Return SOFA 4 if on vasopressin
mutate(sofa_c = ifelse(!is.na(vasopressin) & vasopressin > 0, 4 , sofa_c)) %>%
# Return sofa_c
select(sofa_c)
)
}
Here is the simulated data
# Simulate data
set.seed(1234)
tdata <- data.table(map=round(rnorm(100,70,10)), noradr=round(rnorm(100,0,1),2), vasopressin=sample(c(rep(NA,9),1)))
tdata[, noradr := ifelse(noradr < 0, NA, noradr)]
sofa_c <- gen_sofa_c(tdata, map=map, noradr=rx_norad, dopa=rx_dopa, adr=rx_adre, vasopressin=rx_vasopr)
table(sofa_c)
(cbind(tdata, sofa_c))
My output is this
R> table(sofa_c)
sofa_c
0 1 2 3 4
17 27 4 42 10
R> head((cbind(tdata, sofa_c)),10)
map noradr vasopressin sofa_c
1: 58 0.41 NA 3
2: 73 NA NA 0
3: 81 0.07 1 4
4: 47 NA NA 1
5: 74 NA NA 0
6: 75 0.17 NA 3
7: 64 NA NA 1
8: 65 0.17 NA 3
9: 64 0.35 NA 3
10: 61 NA NA 1
Here is the real data (as sample from >2 million rows)
nrow(ddata)
rdata <- ddata[runif(100,1,nrow(ddata)),.(map,norad=rx_norad,vasopressin=rx_vasopr)]
dput(rdata)
rm(sofa_c)
sofa_c <- gen_sofa_c(rdata, map=map, noradr=rx_norad, dopa=rx_dopa, adr=rx_adre, vasopressin=rx_vasopr)
table(sofa_c)
head((cbind(rdata, sofa_c)),10)
Here is the sample from the real data
R> dput(rdata)
structure(list(map = c(80, 82, 76, NA, 87, NA, NA, NA, NA, NA,
NA, NA, NA, 124, 65, 63, NA, 70, NA, NA, NA, NA, NA, NA, NA,
100, NA, NA, NA, NA, 85, 85, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 97, 84, 0, 84, NA, 75, NA, NA, NA, 67, NA, 58, NA, 153,
122, NA, NA, 91, 90, NA, NA, 87, NA, 60, 72, 107, 62, NA, NA,
97, 88, NA, NA, NA, 60, 81, 80, NA, NA, 82, 72, NA, 98, NA, NA,
80, 82, NA, NA, NA, 68, NA, NA, 126, 90, 65, 67, NA), norad = c(NA,
NA, NA, NA, 0, NA, NA, 0.14, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 0.18, 0.00952381, NA, NA, 0.12962963, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 0.172222, NA, NA, NA, NA, NA, 0.0623529, NA,
NA, NA, NA, 0.29005848, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.221667,
NA, NA, NA, NA, NA, 0.02, NA, NA, NA, NA, NA, 0.08, NA, NA, NA,
NA, NA, NA, NA, NA, 0.284444444, NA, NA, 0.19, NA, NA, NA, NA,
4, NA, NA), vasopressin = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 2, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("map",
"norad", "vasopressin"), row.names = c(NA, -100L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x101810978>)
Here is the real data output
R> table(sofa_c)
sofa_c
3 4
99 1
R> head((cbind(rdata, sofa_c)),10)
map norad vasopressin sofa_c
1: 80 NA NA 3
2: 82 NA NA 3
3: 76 NA NA 3
4: NA NA NA 3
5: 87 0.00 NA 3
6: NA NA NA 3
7: NA NA NA 3
8: NA 0.14 NA 3
9: NA NA NA 3
10: NA NA NA 3