Related
I'm trying to calculate percent change in R with each of the time points included in the column label (table below). I have dplyr loaded and my dataset was loaded in R and I named it data. Below is the code I'm using, but it's not calculating correctly. I want to create a new dataframe called data_per_chg which contains the percent change of each variable from its "v1" value. For instance, for the wbc variable, I would like to calculate the percent change of wbc.v1 from wbc.v1, wbc.v2 from wbc.v1, wbc.v3 from wbc.v1, etc., and do that for all the remaining variables in my dataset. I'm assuming I can probably use a loop to do this easily, but I'm pretty new to R so I'm not quite sure how to proceed. Any guidance will be greatly appreciated.
id
wbc.v1
wbc.v2
wbc.v3
rbc.v1
rbc.v2
rbc.v3
hct.v1
hct.v2
hct.v3
a1
23
63
30
23
56
90
13
89
47
a2
81
45
46
N/A
18
78
14
45
22
a3
NA
27
14
29
67
46
37
34
33
# Fixed: the original was missing the closing ")" after group_by(id,
# which is a syntax error. Spacing added around operators for readability.
# Note this still only computes the change for wbc.v2 vs wbc.v1; the
# answers below generalize it to every variable.
data_per_chg <- data %>%
  group_by(id) %>%
  arrange(id) %>%
  mutate(change = (wbc.v2 - wbc.v1) / wbc.v1)
data_per_chg
First convert the "N/A" strings so that all missing values are real NA values:
library(dplyr)
library(stringr)
# Convert "N/A" strings to real NA, re-infer column types, then for every
# non-id, non-baseline (".v1") column compute its percent change from the
# matching ".v1" column of the same variable.
# NOTE(review): na_if() applied to a whole data frame only works in older
# dplyr versions — newer dplyr expects vector arguments; verify against the
# installed version.
data <- data %>%
na_if("N/A") %>%
type.convert(as.is = TRUE) %>%
mutate(across(-c(id, matches("\\.v1$")), ~ {
# cur_column() is e.g. "wbc.v2"; rewrite its suffix to "v1" and fetch that
# baseline column from the data mask with get()
v1 <- get(str_replace(cur_column(), "v\\d+$", "v1"))
(.x - v1)/v1}, .names = "{.col}_change"))
-output
data
id wbc.v1 wbc.v2 wbc.v3 rbc.v1 rbc.v2 rbc.v3 hct.v1 hct.v2 hct.v3 wbc.v2_change wbc.v3_change rbc.v2_change rbc.v3_change hct.v2_change hct.v3_change
1 a1 23 63 30 23 56 90 13 89 47 1.7391304 0.3043478 1.434783 2.9130435 5.84615385 2.6153846
2 a2 81 45 46 NA 18 78 14 45 22 -0.4444444 -0.4320988 NA NA 2.21428571 0.5714286
3 a3 NA 27 14 29 67 46 37 34 33 NA NA 1.310345 0.5862069 -0.08108108 -0.1081081
If we want to keep the 'v1' columns as well
# Same as above, but also emit a zero-valued change column for each ".v1"
# baseline (NA where the baseline itself is NA), then reorder the output.
data %>%
na_if("N/A") %>%
type.convert(as.is = TRUE) %>%
# .x - .x is 0 for observed values and NA for missing ones: the "change of
# v1 from v1" column
mutate(across(ends_with('.v1'), ~ .x - .x,
.names = "{str_replace(.col, 'v1', 'v1change')}")) %>%
transmute(id, across(ends_with('change')),
across(-c(id, matches("\\.v1$"), ends_with('change')),
~ {
# look up the matching ".v1" baseline column for the current column
v1 <- get(str_replace(cur_column(), "v\\d+$", "v1"))
(.x - v1)/v1}, .names = "{.col}_change")) %>%
# group the change columns variable-by-variable in the output
select(id, starts_with('wbc'), starts_with('rbc'), starts_with('hct'))
-output
id wbc.v1change wbc.v2_change wbc.v3_change rbc.v1change rbc.v2_change rbc.v3_change hct.v1change hct.v2_change hct.v3_change
1 a1 0 1.7391304 0.3043478 0 1.434783 2.9130435 0 5.84615385 2.6153846
2 a2 0 -0.4444444 -0.4320988 NA NA NA 0 2.21428571 0.5714286
3 a3 NA NA NA 0 1.310345 0.5862069 0 -0.08108108 -0.1081081
data
# Example data; rbc.v1 is character because of the "N/A" entry.
data <- data.frame(
  id = c("a1", "a2", "a3"),
  wbc.v1 = c(23L, 81L, NA),
  wbc.v2 = c(63L, 45L, 27L),
  wbc.v3 = c(30L, 46L, 14L),
  rbc.v1 = c("23", "N/A", "29"),
  rbc.v2 = c(56L, 18L, 67L),
  rbc.v3 = c(90L, 78L, 46L),
  hct.v1 = c(13L, 14L, 37L),
  hct.v2 = c(89L, 45L, 34L),
  hct.v3 = c(47L, 22L, 33L)
)
I need to delete some rows in my dataset based on the given condition.
Kindly go through the sample data below for reference.
ID Date Dur
123 01/05/2000 3
123 08/04/2002 6
564 04/04/2012 2
741 01/08/2011 5
789 02/03/2009 1
789 08/01/2010 NA
789 05/05/2011 NA
852 06/06/2015 3
852 03/02/2016 NA
155 03/02/2008 NA
155 01/01/2009 NA
159 07/07/2008 NA
My main concern is the Dur column. I have to delete the rows that have a non-NA Dur value, but only for IDs that have more than one record.
i.e., IDs 123, 789 and 852 have more than one row, and some of those rows have a Dur value; so I need to remove the rows with a Dur value, which means both records of ID 123 and the first record of IDs 789 and 852.
I don't want to delete IDs that have only a single record with a Dur value (564, 741), or any other rows with NA in Dur.
Expected Output:
ID Date Dur
564 04/04/2012 2
741 01/08/2011 5
789 08/01/2010 NA
789 05/05/2011 NA
852 03/02/2016 NA
155 03/02/2008 NA
155 01/01/2009 NA
159 07/07/2008 NA
Kindly suggest a code to solve the issue.
Thanks in Advance!
One way would be to select rows where the number of rows in the group is 1, or where the row has NA in Dur.
This can be written in dplyr as :
library(dplyr)
# Keep a row when its Dur is missing, or when its ID occurs only once
df %>%
  group_by(ID) %>%
  filter(is.na(Dur) | n() == 1)
# ID Date Dur
# <int> <chr> <int>
#1 564 04/04/2012 2
#2 741 01/08/2011 5
#3 789 08/01/2010 NA
#4 789 05/05/2011 NA
#5 852 03/02/2016 NA
#6 155 03/02/2008 NA
#7 155 01/01/2009 NA
#8 159 07/07/2008 NA
Using data.table :
library(data.table)
# Convert df to a data.table by reference, then keep, per ID, the rows that
# are either the only row for that ID or have a missing Dur
setDT(df)
df[, .SD[.N == 1 | is.na(Dur)], by = ID]
and base R :
subset(df, ave(is.na(Dur), ID, FUN = function(x) length(x) == 1 | x))
data
# Example data; Dur is NA for several rows.
df <- data.frame(
  ID = c(123L, 123L, 564L, 741L, 789L, 789L,
         789L, 852L, 852L, 155L, 155L, 159L),
  Date = c("01/05/2000", "08/04/2002", "04/04/2012", "01/08/2011",
           "02/03/2009", "08/01/2010", "05/05/2011", "06/06/2015",
           "03/02/2016", "03/02/2008", "01/01/2009", "07/07/2008"),
  Dur = c(3L, 6L, 2L, 5L, 1L, NA, NA, 3L, NA, NA, NA, NA)
)
We can use .I in data.table
library(data.table)
# .I yields the global row numbers; collect, per ID, the indices of rows
# that are singletons or have NA Dur, then subset once with them
setDT(df1)
keep_rows <- df1[, .I[.N == 1 | is.na(Dur)], ID]$V1
df1[keep_rows]
I am trying to merge 2 columns within the same dataset in order to condense the number of columns.
The dataset currently looks like this:
Year Var1 Var2
2014 NA 123
2014 NA 155
2015 541 NA
2015 432 NA
2016 NA 124
etc
I wish the dataset to look like
Year Var1/2
2014 123
2014 155
2015 541
2015 432
2016 124
Any help is greatly appreciated.
You should be able to just use with(mydf, pmax(Var1, Var2, na.rm = TRUE)).
Here's a sample data.frame. Note row 5.
# Example data; note row 5 is NA in both Var1 and Var2.
mydf <- structure(
  list(
    Year = c(2014L, 2014L, 2015L, 2015L, 2016L),
    Var1 = c(NA, NA, 541L, 432L, NA),
    Var2 = c(123L, 155L, NA, NA, NA)
  ),
  row.names = c(NA, 5L),
  class = "data.frame"
)
mydf
## Year Var1 Var2
## 1 2014 NA 123
## 2 2014 NA 155
## 3 2015 541 NA
## 4 2015 432 NA
## 5 2016 NA NA
with(mydf, pmax(Var1, Var2, na.rm = TRUE))
## [1] 123 155 541 432 NA
Assign it to a column and you're good to go.
Can the paste function help?
df$Var1/2 <- paste(df$Var1,df$Var2)
Date_Time C 4700C Put.15 4800C Put.16 4900C Put.17
1 20120531 NA NA NA NA NA NA NA
2 20120601 1445 4800 208 84.9 143.3 119.8 92 167
3 20120606 1100 4900 268.85 43 192 66.3 127 100
4 20120607 1500 5000 345 24 261 38.25 183 60.5
5 20120612 1515 NA NA NA NA NA NA NA
I have the above sample data frame. For each row, I want to search for the value in the C column among all the column names, and return the value of the matching column for that row as the result.
For example: take the value of the C column in the 3rd row, which is 4900, search for it in all the column names, and once the column 4900C is found, return the value in 4900C for that row.
Please help.
It would have been better if the delimiters in the data were clear. For example "Date_Time" as column name could have elements "20120531 NA" as a string.
We remove the non-numeric substring from the names of the 'df1' (subset it based on 'j2') using sub, match with the 'C' to get the column index ('j1'), get a logical index based on the NA values ('i1'), then with row/column index, we extract the elements from the matched columns (`df1[j2]`) and assign them to a "NewCol".
# Positions of the strike columns, i.e. names like "4700C"
j2 <- grep("\\d+C", names(df1))
# For each row, which strike column matches the value in 'C': strip the
# non-digit suffix ("C") from those names and match df1$C against the
# resulting numbers (match coerces both sides to character)
j1 <- match(df1$C, sub("\\D+", "", names(df1)[j2]))
# Rows where a match exists (C may be NA, or have no matching column)
i1 <- !is.na(j1)
# Matrix (row, col) indexing into the strike columns only; unmatched rows
# are left as NA in NewCol
df1$NewCol[i1] <- df1[j2][cbind((1:nrow(df1))[i1], j1[i1])]
df1
# Date Time C 4700C Put.15 4800C Put.16 4900C Put.17 NewCol
#1 20120531 NA NA NA NA NA NA NA NA NA
#2 20120601 1445 4800 208.00 84.9 143.3 119.80 92 167.0 143.3
#3 20120606 1100 4900 268.85 43.0 192.0 66.30 127 100.0 127.0
#4 20120607 1500 5000 345.00 24.0 261.0 38.25 183 60.5 NA
#5 20120612 1515 NA NA NA NA NA NA NA NA
NOTE: Here I am assuming that 'Time' is the second column
data
# Example data: strike columns ("4700C", "4800C", "4900C") interleaved with
# Put columns; rows 1 and 5 are NA apart from Date/Time.
df1 <- structure(
  list(
    Date = c(20120531L, 20120601L, 20120606L, 20120607L, 20120612L),
    Time = c(NA, 1445L, 1100L, 1500L, 1515L),
    C = c(NA, 4800L, 4900L, 5000L, NA),
    `4700C` = c(NA, 208, 268.85, 345, NA),
    Put.15 = c(NA, 84.9, 43, 24, NA),
    `4800C` = c(NA, 143.3, 192, 261, NA),
    Put.16 = c(NA, 119.8, 66.3, 38.25, NA),
    `4900C` = c(NA, 92L, 127L, 183L, NA),
    Put.17 = c(NA, 167, 100, 60.5, NA)
  ),
  class = "data.frame",
  row.names = c("1", "2", "3", "4", "5")
)
This is my effort to reconstruct a question that appeared yesterday, which I worked most of the morning to solve but can no longer find. Two datasets, df1 and a smaller df2, were provided with the same column names, with a request to replace only NA values in rows where the date column matched. I suppose a merge could have done it and might well have been less onerous, but I was seeking a match()-ing and indexing strategy and eventually found one:
# df1: the larger table, with NA gaps to be filled
df1 <- structure(
  list(
    date = c(20040101L, 20040115L, 20040131L, 20040205L,
             20040228L, 20040301L, 20040315L, 20040331L),
    X11A = c(100L, 200L, NA, NA, NA, 150L, NA, NA),
    X11A.1 = c(150L, NA, 165L, NA, NA, 155L, NA, NA),
    X21B = c(NA, 200L, 180L, NA, NA, 170L, 180L, NA),
    X3CC = c(NA, NA, 190L, NA, NA, 150L, 190L, 175L),
    X3CC.1 = c(140L, NA, 190L, NA, NA, 160L, 200L, 180L)
  ),
  class = "data.frame",
  row.names = c(NA, -8L)
)
# df2: the smaller table supplying replacement values; dates out of order
df2 <- structure(
  list(
    date = c(20040228L, 20040131L, 20040331L),
    X11A = c(140L, 170L, NA),
    X11A.1 = c(145L, NA, 145L),
    X21B = c(165L, NA, 160L),
    X3CC = c(150L, NA, NA),
    X3CC.1 = c(155L, NA, NA)
  ),
  class = "data.frame",
  row.names = c(NA, -3L)
)
What was actually offered:
df1:
date 11A 11A 21B 3CC 3CC
20040101 100 150 NA NA 140
20040115 200 NA 200 NA NA
20040131 NA 165 180 190 190
20040205 NA NA NA NA NA
20040228 NA NA NA NA NA
20040301 150 155 170 150 160
20040315 NA NA 180 190 200
20040331 NA NA NA 175 180
df2:
date 11A 11A 21B 3CC 3CC
20040228 140 145 165 150 155
20040131 170 NA NA NA NA
20040331 NA 145 160 NA NA
The is.na function can create a "template" of logicals from a dataframe argument. My goal was to create such a template and then select only the rows with a match result between the two date columns. Using which with arr.ind=TRUE gives a two-column matrix that can be used as a single argument to either [<- or [:
valpos <- which(is.na(df1)[match(df2$date, df1$date), ], arr.ind=TRUE)
The next task is to convert the first column (named "row") so that the correct row is substituted for the "target" dataframe:
# The 'row' indices in valpos refer to df2's row order; map them back to
# the corresponding row numbers in df1, keeping the same column indices
targpos <- cbind( match(df2$date, df1$date)[ valpos[,'row'] ] ,
valpos[,'col'])
Then it's just:
> df1[targpos] <- df2[valpos]
> df1
date X11A X11A.1 X21B X3CC X3CC.1
1 20040101 100 150 NA NA 140
2 20040115 200 NA 200 NA NA
3 20040131 170 165 180 190 190
4 20040205 NA NA NA NA NA
5 20040228 140 145 165 150 155
6 20040301 150 155 170 150 160
7 20040315 NA NA 180 190 200
8 20040331 NA 145 160 175 180
I did make the problem a bit more difficult when I shuffled the order of the dates. I think this logic solves that difficulty as well.
The following solution precomputes (1) the row mapping from df2 to df1 according to the date column and (2) the common data column names between the two data.frames. It then iterates over the common columns, and for each it tests which cells within the df1 column are both mapped to df2 and have a value of NA, and then assigns those cells from whatever value is available in df2.
Advantages:
Does not require perfectly corresponding column sets; will match them up by column name.
Processes the minimal amount of data, by only iterating over the common columns, only NA-testing the df1 cells that map to df2, and only assigning the NA cells.
Protects the input column data types. IOW, if df1 had heterogeneous column types, those types would not be corrupted by this operation.
# Row mapping: for each df2 row, the matching row number in df1
rms <- match(df2$date, df1$date)
# Data columns present in both frames (skip the leading date column)
cms <- intersect(names(df1)[-1L], names(df2)[-1L])
for (cm in cms) {
  # Of the df1 cells mapped to df2, find the ones that are NA …
  missing_mask <- is.na(df1[[cm]][rms])
  # … and fill only those from the corresponding df2 cells
  df1[[cm]][rms][missing_mask] <- df2[[cm]][missing_mask]
}
df1
## date X11A X11A.1 X21B X3CC X3CC.1
## 1 20040101 100 150 NA NA 140
## 2 20040115 200 NA 200 NA NA
## 3 20040131 170 165 180 190 190
## 4 20040205 NA NA NA NA NA
## 5 20040228 140 145 165 150 155
## 6 20040301 150 155 170 150 160
## 7 20040315 NA NA 180 190 200
## 8 20040331 NA 145 160 175 180
Benchmarking
library(microbenchmark);
# The two competing implementations, wrapped as functions for benchmarking.
`42` <- function(df1, df2) {
  valpos <- which(is.na(df1)[match(df2$date, df1$date), ], arr.ind = TRUE)
  targpos <- cbind(match(df2$date, df1$date)[valpos[, 'row']],
                   valpos[, 'col'])
  df1[targpos] <- df2[valpos]
  df1
}
bgoldst <- function(df1, df2) {
  rms <- match(df2$date, df1$date)
  cms <- intersect(names(df1)[-1L], names(df2)[-1L])
  for (cm in cms) {
    n <- is.na(df1[[cm]][rms])
    df1[[cm]][rms][n] <- df2[[cm]][n]
  }
  df1
}
# Sanity check: both implementations produce identical results on the
# small example data
identical(`42`(df1,df2),bgoldst(df1,df2));
## [1] TRUE
# Timing comparison on the small example
microbenchmark(`42`(df1,df2),bgoldst(df1,df2));
## Unit: microseconds
## expr min lq mean median uq max neval
## `42`(df1, df2) 297.219 309.1935 340.1425 319.0295 333.9975 1236.771 100
## bgoldst(df1, df2) 175.766 181.7530 192.9317 188.1670 198.2180 316.463 100
# Reproducible large-scale benchmark data: df1 has NR1 rows (sorted random
# dates) plus NC1 value columns with ~probNA1 missing; df2 samples NR2 of
# df1's dates and has NC2 value columns with ~probNA2 missing.
# Fixed idiom issues from the original: T -> TRUE, partial argument match
# len= -> length.out=, and the replace/prob/nrow arguments are now named.
set.seed(1L)
NR1 <- 10000L; NC1 <- 300L; NR2 <- 1000L; NC2 <- 300L
probNA1 <- 0.5; probNA2 <- 0.1
df1 <- data.frame(
  date = as.integer(format(
    sort(sample(seq(as.Date('2004-01-01'), by = 1L, length.out = NR1 * 5L), NR1)),
    '%Y%m%d'
  ))
)
df1[paste0('X', seq_len(NC1))] <- matrix(
  sample(c(NA, 100:200), NR1 * NC1, replace = TRUE,
         prob = c(probNA1, rep((1 - probNA1) / 101, 101L))),
  nrow = NR1
)
df2 <- data.frame(date = sample(df1$date, NR2))
df2[paste0('X', seq_len(NC2))] <- matrix(
  sample(c(NA, 100:200), NR2 * NC2, replace = TRUE,
         prob = c(probNA2, rep((1 - probNA2) / 101, 101L))),
  nrow = NR2
)
# Re-check equivalence on the large data, then compare timings again
identical(`42`(df1,df2),bgoldst(df1,df2));
## [1] TRUE
microbenchmark(`42`(df1,df2),bgoldst(df1,df2));
## Unit: milliseconds
## expr min lq mean median uq max neval
## `42`(df1, df2) 149.61503 194.66606 216.16916 231.25129 233.68079 277.24701 100
## bgoldst(df1, df2) 29.17145 31.32318 37.85904 32.15154 33.24013 75.47765 100