Sort rows grouped by grep alphabetically - r

I have a data frame with a column of adverse events in which each event is followed by rows giving its relationship to the procedure, like this:
df <- data.frame(
adverse_event = c(
"Haemorrhage", "related", "likely related",
"Other", "related", "likely related", "Pain", "related", "likely related",
"Subcapsular hematoma", "related", "likely related", "Ascites",
"related", "likely related", "Hyperbilirubinemia", "related",
"likely related", "Liver abscess", "related", "likely related",
"Pleural effusion with drainage", "related", "likely related",
"Pneumothorax", "related", "likely related", "Biliary leakage / occlusion / fistula",
"related", "likely related", "Portal vein thrombosis", "related",
"likely related", "Sepsis", "related", "likely related"
),
grade_1 = c(
4L, 4L, 0L, 3L, 6L, 1L, 8L, 4L, 5L, 3L, 1L, 3L, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
),
grade_2 = c(
2L, 3L, 0L, 11L, 3L, 7L, 2L, 4L, 2L, 1L, 2L, 0L, 1L, 1L, 0L,
1L, 0L, 2L, 1L, 1L, 0L, 1L, 2L, 1L, 1L, 1L, 0L, NA, NA, NA, NA,
NA, NA, NA, NA, NA
),
grade_3 = c(
1L, 4L, 1L, 5L, 3L, 2L, 2L, 5L, 1L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 4L, 5L, 1L, NA, NA, NA, 1L, 1L, 0L, 1L, 2L, 0L, 1L,
1L, 0L, 1L, 1L, 0L
),
grade_4 = c(
2L, 4L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
)
)
Now I'd like to sort the adverse events alphabetically but of course take the "related", "likely related" rows with the individual adverse event rows, so I'd like to somehow group them first.
In this example each group always has 3 rows, but let's assume it could sometimes have 2, 4 or 5 rows (every row other than the adverse event row itself contains "related" in its name, e.g. 'unlikely related').
I know, I can get the indices of the adverse event rows by
grep('related', df$adverse_event, invert = T) but I'm unsure how to use this to group the rows together before sorting them.
Edit: Beginning of the left column of the desired output:
expected_output_left_column <- data.frame(adverse_event = c(
"Ascites", "related", "likely related",
"Biliary leakage / occlusion / fistula", "related", "likely related" ) )
Thank you!

Another solution using base r and lead function from dplyr
# Row indices where each group starts: the rows whose label lacks "related"
id <- grep("related", df$adverse_event, invert = TRUE)
# Size of each group = distance to the next group start. lead() yields NA for
# the last group, which runs to the end of the data frame instead.
# dplyr::lead is namespaced so the snippet works without library(dplyr).
size <- dplyr::lead(id) - id
size[length(size)] <- nrow(df) - id[length(id)] + 1
# Temporary sort key: the adverse event that heads each row's group.
# The key is the heading alone (not heading + row label): order() leaves tied
# rows in their original order, so "related" stays before "likely related".
df$id <- rep(df$adverse_event[id], times = size)
# Sort whole groups alphabetically by their heading
df <- df[order(df$id), ]
# Drop the temporary key
df$id <- NULL

You can do the following:
library(dplyr)
left_join(
  df,
  # Lookup table: every distinct adverse event (label without "related")
  # with its alphabetical position `o`. distinct() keeps the key unique so
  # a repeated event name cannot duplicate rows in the join.
  df %>%
    filter(!grepl("related", adverse_event)) %>%
    distinct(adverse_event) %>%
    arrange(adverse_event) %>%
    mutate(o = row_number()),
  # Explicit key: no "Joining with `by = ...`" message and no accidental
  # match on other shared column names
  by = "adverse_event"
) %>%
  # "related" rows have no match (o is NA): carry down the position of the
  # adverse event above them
  mutate(o = data.table::nafill(o, "locf")) %>%
  arrange(o) %>%
  select(-o)
Output:
adverse_event grade_1 grade_2 grade_3 grade_4
1 Ascites NA 1 NA NA
2 related NA 1 NA NA
3 likely related NA 0 NA NA
4 Biliary leakage / occlusion / fistula NA NA 1 NA
5 related NA NA 2 NA
6 likely related NA NA 0 NA
7 Haemorrhage 4 2 1 2
8 related 4 3 4 4
9 likely related 0 0 1 1
10 Hyperbilirubinemia NA 1 NA NA
11 related NA 0 NA NA
12 likely related NA 2 NA NA
13 Liver abscess NA 1 4 NA
14 related NA 1 5 NA
15 likely related NA 0 1 NA
16 Other 3 11 5 NA
17 related 6 3 3 NA
18 likely related 1 7 2 NA
19 Pain 8 2 2 NA
20 related 4 4 5 NA
21 likely related 5 2 1 NA
22 Pleural effusion with drainage NA 1 NA NA
23 related NA 2 NA NA
24 likely related NA 1 NA NA
25 Pneumothorax NA 1 1 NA
26 related NA 1 1 NA
27 likely related NA 0 0 NA
28 Portal vein thrombosis NA NA 1 NA
29 related NA NA 1 NA
30 likely related NA NA 0 NA
31 Sepsis NA NA 1 NA
32 related NA NA 1 NA
33 likely related NA NA 0 NA
34 Subcapsular hematoma 3 1 NA NA
35 related 1 2 NA NA
36 likely related 3 0 NA NA
Note that this uses data.table::nafill(). A full data.table solution is shown below:
library(data.table)
# Convert df to a data.table in place (by reference, no copy)
setDT(df)
# Read the chain below left to right:
#   1. build a one-column table of the adverse events (labels without
#      "related"), sorted alphabetically, and number its rows in `o` (.I)
#   2. join it onto df by adverse_event; "related" rows get o = NA
#   3. fill those NAs with the last non-NA value above them ("locf")
#   4. sort by o and return every column except o
data.table(adverse_event = sort(df[!grepl('related',adverse_event), adverse_event]))[, o:=.I][
df, on="adverse_event"][, o:=nafill(o, "locf")][order(o), !c("o")]

Add a "group" variable and sort
# TRUE on the rows that name an adverse event (label without "related")
is_event <- !grepl("related", df$adverse_event)
# Group number: running count of event rows, so an event and the "related"
# rows below it share one value
df$grp <- cumsum(is_event)
# Group numbers listed in alphabetical order of their event name
sorted_groups <- order(df$adverse_event[is_event])
# Sort the rows by the position of their group in that list
df[order(match(df$grp, sorted_groups)), ]
adverse_event grade_1 grade_2 grade_3 grade_4 grp
13 Ascites NA 1 NA NA 5
14 related NA 1 NA NA 5
15 likely related NA 0 NA NA 5
28 Biliary leakage / occlusion / fistula NA NA 1 NA 10
29 related NA NA 2 NA 10
30 likely related NA NA 0 NA 10
1 Haemorrhage 4 2 1 2 1
2 related 4 3 4 4 1
3 likely related 0 0 1 1 1
16 Hyperbilirubinemia NA 1 NA NA 6
17 related NA 0 NA NA 6
18 likely related NA 2 NA NA 6
19 Liver abscess NA 1 4 NA 7
20 related NA 1 5 NA 7
21 likely related NA 0 1 NA 7
4 Other 3 11 5 NA 2
5 related 6 3 3 NA 2
6 likely related 1 7 2 NA 2
7 Pain 8 2 2 NA 3
8 related 4 4 5 NA 3
9 likely related 5 2 1 NA 3
22 Pleural effusion with drainage NA 1 NA NA 8
23 related NA 2 NA NA 8
24 likely related NA 1 NA NA 8
25 Pneumothorax NA 1 1 NA 9
26 related NA 1 1 NA 9
27 likely related NA 0 0 NA 9
31 Portal vein thrombosis NA NA 1 NA 11
32 related NA NA 1 NA 11
33 likely related NA NA 0 NA 11
34 Sepsis NA NA 1 NA 12
35 related NA NA 1 NA 12
36 likely related NA NA 0 NA 12
10 Subcapsular hematoma 3 1 NA NA 4
11 related 1 2 NA NA 4
12 likely related 3 0 NA NA 4

Just to throw in another tidyverse solution:
library(tidyr)
library(dplyr)
df %>%
  # Stem label: the event name on event rows, NA on "related" rows
  mutate(
    grp = if_else(
      !grepl("related", adverse_event),
      adverse_event,
      NA_character_
    )
  ) %>%
  # Carry each stem down over the "related" rows beneath it
  fill(grp) %>%
  # One row per stem, with the group's rows packed into a list-column
  nest(data = -grp) %>%
  # Sort the stems, then unpack the rows and drop the helper column
  arrange(grp) %>%
  unnest(cols = data) %>%
  select(-grp)
# # A tibble: 36 × 5
# adverse_event grade_1 grade_2 grade_3 grade_4
# <chr> <int> <int> <int> <int>
# 1 Ascites NA 1 NA NA
# 2 related NA 1 NA NA
# 3 likely related NA 0 NA NA
# 4 Biliary leakage / occlusion / fistula NA NA 1 NA
# 5 related NA NA 2 NA
# 6 likely related NA NA 0 NA
# 7 Haemorrhage 4 2 1 2
# 8 related 4 3 4 4
# 9 likely related 0 0 1 1
# 10 Hyperbilirubinemia NA 1 NA NA
# 11 related NA 0 NA NA
# 12 likely related NA 2 NA NA
# 13 Liver abscess NA 1 4 NA
# 14 related NA 1 5 NA
# 15 likely related NA 0 1 NA
# 16 Other 3 11 5 NA
# 17 related 6 3 3 NA
# 18 likely related 1 7 2 NA
# 19 Pain 8 2 2 NA
# 20 related 4 4 5 NA
# 21 likely related 5 2 1 NA
# 22 Pleural effusion with drainage NA 1 NA NA
# 23 related NA 2 NA NA
# 24 likely related NA 1 NA NA
# 25 Pneumothorax NA 1 1 NA
# 26 related NA 1 1 NA
# 27 likely related NA 0 0 NA
# 28 Portal vein thrombosis NA NA 1 NA
# 29 related NA NA 1 NA
# 30 likely related NA NA 0 NA
# 31 Sepsis NA NA 1 NA
# 32 related NA NA 1 NA
# 33 likely related NA NA 0 NA
# 34 Subcapsular hematoma 3 1 NA NA
# 35 related 1 2 NA NA
# 36 likely related 3 0 NA NA
Explanation
mutate + fill: Label each adverse_event with the stem, i.e. re-label all related records with the corresponding event above.
Nest all columns, but keep the newly created grp column, which bears the name of the stem adverse event.
Sort the adverse event stems.
Unnest the rows again.
Remove the grp column.

An approach using rank. Using an extended data set with 4 entries for "Ascites".
library(dplyr)
df %>%
  # ord: TRUE on rows that name an adverse event
  mutate(ord = !grepl("related", adverse_event)) %>%
  # grp: running count of event rows = group number of every row
  mutate(grp = cumsum(ord)) %>%
  # Rank: alphabetical rank of each group's event name, looked up by group
  mutate(Rank = rank(adverse_event[ord])[grp]) %>%
  arrange(Rank) %>%
  # Remove the three helper columns
  select(-ord, -grp, -Rank)
adverse_event grade_1 grade_2 grade_3 grade_4
1 Ascites NA 1 NA NA
2 related NA 1 NA NA
3 related NA 1 NA NA
4 likely related NA 0 NA NA
5 Biliary leakage / occlusion / fistula NA NA 1 NA
6 related NA NA 2 NA
7 likely related NA NA 0 NA
8 Haemorrhage 4 2 1 2
9 related 4 3 4 4
10 likely related 0 0 1 1
11 Hyperbilirubinemia NA 1 NA NA
12 related NA 0 NA NA
13 likely related NA 2 NA NA
14 Liver abscess NA 1 4 NA
15 related NA 1 5 NA
16 likely related NA 0 1 NA
17 Other 3 11 5 NA
18 related 6 3 3 NA
19 likely related 1 7 2 NA
20 Pain 8 2 2 NA
21 related 4 4 5 NA
22 likely related 5 2 1 NA
23 Pleural effusion with drainage NA 1 NA NA
24 related NA 2 NA NA
25 likely related NA 1 NA NA
26 Pneumothorax NA 1 1 NA
27 related NA 1 1 NA
28 likely related NA 0 0 NA
29 Portal vein thrombosis NA NA 1 NA
30 related NA NA 1 NA
31 likely related NA NA 0 NA
32 Sepsis NA NA 1 NA
33 related NA NA 1 NA
34 likely related NA NA 0 NA
35 Subcapsular hematoma 3 1 NA NA
36 related 1 2 NA NA
37 likely related 3 0 NA NA
extended data
df <- structure(list(adverse_event = c("Haemorrhage", "related", "likely related",
"Other", "related", "likely related", "Pain", "related", "likely related",
"Subcapsular hematoma", "related", "likely related", "Ascites",
"related", "related", "likely related", "Hyperbilirubinemia",
"related", "likely related", "Liver abscess", "related", "likely related",
"Pleural effusion with drainage", "related", "likely related",
"Pneumothorax", "related", "likely related", "Biliary leakage / occlusion / fistula",
"related", "likely related", "Portal vein thrombosis", "related",
"likely related", "Sepsis", "related", "likely related"), grade_1 = c(4L,
4L, 0L, 3L, 6L, 1L, 8L, 4L, 5L, 3L, 1L, 3L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), grade_2 = c(2L, 3L, 0L, 11L, 3L, 7L, 2L, 4L,
2L, 1L, 2L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 2L, 1L, 1L, 0L, 1L, 2L,
1L, 1L, 1L, 0L, NA, NA, NA, NA, NA, NA, NA, NA, NA), grade_3 = c(1L,
4L, 1L, 5L, 3L, 2L, 2L, 5L, 1L, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 4L, 5L, 1L, NA, NA, NA, 1L, 1L, 0L, 1L, 2L, 0L, 1L, 1L,
0L, 1L, 1L, 0L), grade_4 = c(2L, 4L, 1L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA,
37L), class = "data.frame")

Here is a benchmark of the different suggestions, if needed:
library(bench)
library(dplyr)
library(data.table)
library(tidyr)
df <- data.frame(
adverse_event = c(
"Haemorrhage", "related", "likely related",
"Other", "related", "likely related", "Pain", "related", "likely related",
"Subcapsular hematoma", "related", "likely related", "Ascites",
"related", "likely related", "Hyperbilirubinemia", "related",
"likely related", "Liver abscess", "related", "likely related",
"Pleural effusion with drainage", "related", "likely related",
"Pneumothorax", "related", "likely related", "Biliary leakage / occlusion / fistula",
"related", "likely related", "Portal vein thrombosis", "related",
"likely related", "Sepsis", "related", "likely related"
),
grade_1 = c(
4L, 4L, 0L, 3L, 6L, 1L, 8L, 4L, 5L, 3L, 1L, 3L, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
),
grade_2 = c(
2L, 3L, 0L, 11L, 3L, 7L, 2L, 4L, 2L, 1L, 2L, 0L, 1L, 1L, 0L,
1L, 0L, 2L, 1L, 1L, 0L, 1L, 2L, 1L, 1L, 1L, 0L, NA, NA, NA, NA,
NA, NA, NA, NA, NA
),
grade_3 = c(
1L, 4L, 1L, 5L, 3L, 2L, 2L, 5L, 1L, NA, NA, NA, NA, NA, NA,
NA, NA, NA, 4L, 5L, 1L, NA, NA, NA, 1L, 1L, 0L, 1L, 2L, 0L, 1L,
1L, 0L, 1L, 1L, 0L
),
grade_4 = c(
2L, 4L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA
)
)
# Base R approach: sort groups of rows alphabetically by their heading.
#
# A group is a heading row (label without "related") plus the rows that
# follow it up to the next heading. Groups are ordered by heading name; the
# rows inside a group keep their original order.
#
# df: data frame with a column `adverse_event`.
# Returns df with its rows reordered. Rows that precede the first heading
# belong to no group and are placed last.
paul_carteron <- function(df) {
  labels <- as.character(df$adverse_event)
  is_heading <- !grepl("related", labels)
  # Running count of headings = group number of each row (0 before the first)
  grp <- cumsum(is_heading)
  # Heading name of each row's group; the leading NA serves group number 0
  sort_key <- c(NA_character_, labels[is_heading])[grp + 1L]
  # order() leaves tied rows in their original order, so each group stays
  # contiguous and in sequence. The sorted data frame is the return value
  # (the earlier version ended on `df$id <- NULL` and therefore returned NULL).
  df[order(sort_key), ]
}
# dplyr approach: rank the adverse events alphabetically, join the rank onto
# the full table, carry it down over the "related" rows, and sort by it.
#
# df: data frame with a character column `adverse_event`.
# Returns df with whole groups in alphabetical order of their event name.
lang_tang_dplyr <- function(df) {
  # Lookup table: each distinct event name with its alphabetical position.
  # distinct() keeps the join key unique, so a repeated event name cannot
  # duplicate rows in the join below.
  event_rank <- df %>%
    filter(!grepl("related", adverse_event)) %>%
    distinct(adverse_event) %>%
    arrange(adverse_event) %>%
    mutate(o = row_number())

  df %>%
    # Explicit key: no join message, no accidental match on other columns
    left_join(event_rank, by = "adverse_event") %>%
    # "related" rows have no match (o is NA): reuse the value from above
    mutate(o = data.table::nafill(o, "locf")) %>%
    arrange(o) %>%
    select(-o)
}
# data.table approach: same idea as the dplyr version (rank the event names,
# join, fill the rank down, sort), expressed as one chained data.table call.
#
# df: data frame (or data.table) with a character column `adverse_event`.
# Returns a data.table with whole groups in alphabetical order of event name.
lang_tang_databable <- function(df) {
  # as.data.table() works on a copy. setDT() would convert the caller's
  # object by reference, so every later benchmark expression would silently
  # receive a data.table instead of the original data.frame.
  dt <- as.data.table(df)
  # Distinct event names (labels without "related") in alphabetical order
  events <- dt[!grepl("related", adverse_event), sort(unique(adverse_event))]
  # Number the events, join onto the data, fill the number down over the
  # "related" rows, sort by it and drop it
  data.table(adverse_event = events)[, o := .I][
    dt, on = "adverse_event"][, o := nafill(o, "locf")][order(o), !c("o")]
}
# Rank approach: give every row the alphabetical rank of the adverse event
# heading its group, then sort by that rank.
#
# df: data frame with a column `adverse_event`.
# Returns df sorted by group, without the helper columns.
andre_wilberg <- function(df) {
  # ord: TRUE on rows that name an adverse event
  flagged <- mutate(df, ord = !grepl("related", adverse_event))
  # grp: running count of event rows = group number of every row
  grouped <- mutate(flagged, grp = cumsum(ord))
  # Rank: rank of each event name, looked up by group number
  ranked <- mutate(grouped, Rank = rank(adverse_event[ord])[grp])
  sorted_rows <- arrange(ranked, Rank)
  select(sorted_rows, -c(ord, grp, Rank))
}
# Nest approach: label each row with the adverse event heading its group,
# nest the rows per label, sort the labels, and unnest again.
#
# df: data frame with a character column `adverse_event`.
# Returns the rows regrouped in alphabetical order of their event name.
thotal <- function(df) {
  # Stem label: the event name on event rows, NA on "related" rows
  labelled <- mutate(
    df,
    grp = if_else(grepl("related", adverse_event),
                  NA_character_,
                  adverse_event)
  )
  # Carry each stem down over the "related" rows beneath it
  filled <- fill(labelled, grp)
  # One row per stem with the group's rows in a list-column, sorted by stem
  nested <- nest(filled, data = -grp)
  sorted_groups <- arrange(nested, grp)
  # Unpack the rows again and drop the helper column
  flat <- unnest(sorted_groups, cols = data)
  select(flat, -grp)
}
# Time the five approaches on the same data frame, 1000 iterations each,
# reporting times in seconds.
# check = FALSE switches off bench::mark's check that all expressions return
# equal results.
# NOTE(review): presumably needed because the functions return different
# classes (data.frame, tibble, data.table) — confirm.
# filter_gc = FALSE keeps the iterations in which garbage collection ran.
results = bench::mark(
iterations = 1000, check = FALSE, time_unit = "s", filter_gc = FALSE,
paul_carteron = paul_carteron(df),
lang_tang_dplyr = lang_tang_dplyr(df),
lang_tang_databable = lang_tang_databable(df),
andre_wilberg = andre_wilberg(df),
thotal = thotal(df)
)
# Plot the timing distribution of each expression
plot(results)

Related

Impute missing values with a value from previous month (if exists)

I have a dataframe with more than 100 000 rows and 30 000 unique ids.
My aim is to fill all the NAs among the different columns if there is a value from the previous month and the same id. However, most of the time the previous recorded value is from more than a month ago. I would like to leave those NAs untouched.
The id column and the date column do not have NAs.
Here is an example of the data I have:
df3
id oxygen gluco dias bp date
1 0,25897842 0,20201604 0,17955655 0,14100962 31.7.2019
2 NA NA 0,38582622 0,12918231 31.12.2014
2 0,35817147 0,32943499 NA 0,43667462 30.11.2018
2 0,68557053 0,42898807 0,93897514 NA 31.10.2018
2 NA NA 0,99899076 0,44168223 31.7.2018
2 0,43848054 0,38604586 NA NA 30.4.2013
2 0,15823254 0,06216771 0,07829624 0,69755251 31.1.2016
2 NA NA 0,61645303 NA 29.2.2016
2 0,94671363 0,50682091 0,96770222 0,97403356 31.5.2018
3 NA 0,77352235 0,660479 0,11554399 30.4.2019
3 0,15567703 NA 0,4553325 NA 31.3.2017
3 NA NA 0,22181609 0,08527658 30.9.2017
3 0,93660763 NA NA NA 31.3.2018
3 0,73416759 NA NA 0,78501791 30.11.2018
3 NA NA NA NA 28.2.2019
3 0,84525106 0,54360374 NA 0,40595426 31.8.2014
3 0,76221263 0,62983336 0,84592719 0,10640734 31.8.2013
4 NA 0,29108942 0,3863479 NA 31.1.2018
4 0,74075742 NA 0,38117415 0,58849266 30.11.2018
4 0,09400641 0,68860814 NA 0,88895224 31.8.2014
4 0,72202944 0,49901387 0,19967415 NA 31.8.2018
4 0,98205262 0,85213969 0,34450998 0,98962306 30.11.2013
This is the last code implementation that I have tried:
```
df3 %>%
group_by(id) %>%
mutate_all(funs(na.locf(., na.rm = FALSE, maxgap = 30)))
```
But apparently "mutate_all() ignored the following grouping variables:
Column id"
You can use the tidyverse for that. Here's an approach:
Change the date column to class Date, then order by date
Prepare the dates and remove the days in Ym
get the time difference in mo
flag the rows which have max one month difference
get groups by cumsum the inverse logic in flag
fill the rows from the same groups
library(dplyr)
library(tidyr)
library(lubridate)
# Make sure the date column is of class Date
df$date <- as.Date(df$date, format = "%d.%m.%Y")
df %>%
  # Work within each id, in date order, so lag() below always refers to the
  # previous record of the SAME id. Lagging over rows of all ids mixed
  # together would let another id's record bridge a gap longer than a month.
  arrange(id, date) %>%
  group_by(id) %>%
  mutate(
    # First day of the record's month (the day of month is dropped)
    Ym = ym(strftime(date, "%Y-%m")),
    # Months from this record back to the previous one of the same id
    # (0 = same month, -1 = previous month); the 1970 default makes the
    # first record of every id start a new run
    mo = interval(Ym, lag(Ym, default = as.Date("1970-01-01"))) / months(1),
    # Run counter: increases whenever the gap is more than one month
    flag = cumsum(!(mo > -2 & mo < 1))) %>%
  # Fill downwards only inside a run of month-adjacent records of one id
  group_by(id, flag) %>%
  fill(names(.), .direction = "down") %>%
  ungroup() %>%
  select(-c("Ym", "mo", "flag")) %>%
  # Present the result in date order
  arrange(date) %>%
  print(n = nrow(.))
Output
# A tibble: 22 × 6
id oxygen gluco dias bp date
<int> <chr> <chr> <chr> <chr> <date>
1 2 0,43848054 0,38604586 NA NA 2013-04-30
2 3 0,76221263 0,62983336 0,84592719 0,10640734 2013-08-31
3 4 0,98205262 0,85213969 0,34450998 0,98962306 2013-11-30
4 3 0,84525106 0,54360374 NA 0,40595426 2014-08-31
5 4 0,09400641 0,68860814 NA 0,88895224 2014-08-31
6 2 NA NA 0,38582622 0,12918231 2014-12-31
7 2 0,15823254 0,06216771 0,07829624 0,69755251 2016-01-31
8 2 0,15823254 0,06216771 0,61645303 0,69755251 2016-02-29
9 3 0,15567703 NA 0,4553325 NA 2017-03-31
10 3 NA NA 0,22181609 0,08527658 2017-09-30
11 4 NA 0,29108942 0,3863479 NA 2018-01-31
12 3 0,93660763 NA NA NA 2018-03-31
13 2 0,94671363 0,50682091 0,96770222 0,97403356 2018-05-31
14 2 NA NA 0,99899076 0,44168223 2018-07-31
15 4 0,72202944 0,49901387 0,19967415 NA 2018-08-31
16 2 0,68557053 0,42898807 0,93897514 NA 2018-10-31
17 2 0,35817147 0,32943499 0,93897514 0,43667462 2018-11-30
18 3 0,73416759 NA NA 0,78501791 2018-11-30
19 4 0,74075742 NA 0,38117415 0,58849266 2018-11-30
20 3 NA NA NA NA 2019-02-28
21 3 NA 0,77352235 0,660479 0,11554399 2019-04-30
22 1 0,25897842 0,20201604 0,17955655 0,14100962 2019-07-31
Data
df <- structure(list(id = c(1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L), oxygen = c("0,25897842",
NA, "0,35817147", "0,68557053", NA, "0,43848054", "0,15823254",
NA, "0,94671363", NA, "0,15567703", NA, "0,93660763", "0,73416759",
NA, "0,84525106", "0,76221263", NA, "0,74075742", "0,09400641",
"0,72202944", "0,98205262"), gluco = c("0,20201604", NA, "0,32943499",
"0,42898807", NA, "0,38604586", "0,06216771", NA, "0,50682091",
"0,77352235", NA, NA, NA, NA, NA, "0,54360374", "0,62983336",
"0,29108942", NA, "0,68860814", "0,49901387", "0,85213969"),
dias = c("0,17955655", "0,38582622", NA, "0,93897514", "0,99899076",
NA, "0,07829624", "0,61645303", "0,96770222", "0,660479",
"0,4553325", "0,22181609", NA, NA, NA, NA, "0,84592719",
"0,3863479", "0,38117415", NA, "0,19967415", "0,34450998"
), bp = c("0,14100962", "0,12918231", "0,43667462", NA, "0,44168223",
NA, "0,69755251", NA, "0,97403356", "0,11554399", NA, "0,08527658",
NA, "0,78501791", NA, "0,40595426", "0,10640734", NA, "0,58849266",
"0,88895224", NA, "0,98962306"), date = structure(c(18108,
16435, 17865, 17835, 17743, 15825, 16831, 16860, 17682, 18016,
17256, 17439, 17621, 17865, 17955, 16313, 15948, 17562, 17865,
16313, 17774, 16039), class = "Date")), row.names = c(NA,
-22L), class = "data.frame")

Find columns that have identical values

Problem statement:
I actually want to eliminate from further analysis columns that have identical values in all cells. In order to do this, I want to find the columns that have identical values.
I wrote the following code which seems to be working for the dataframe test but not for the real dataframe stpo
library("dplyr")
library("purrr")
# Count the distinct values in x; NA counts as a value of its own.
test_unique <- function(x) {
  distinct_values <- unique(x)
  length(distinct_values)
}
test <-data.frame(c1 = c("a", "a"), c2 = c(NA, NA), c3 = c(1,2), c4=c(NA, 4))
# What I want to find out the columns that have the same value throughout
res <- map(test[,c(names(test))], test_unique)
res
# But when I try to apply the same thing to the dataset below, it does not work.
# Not sure what the reason is. Is there a better way to do this? Perhaps using data.table? What am I doing wrong?
res2 <- map(stpo[,c(names(stpo))], test_unique)
res2
I am not exactly sure how to put the result of dput. I am putting this below (this is the dataframe stpo)
structure(list(stlnr = c(1L, 2L, 3L, 3L, 3L, 3L, 4L), stlkn = c(1L,
1L, 1L, 2L, 3L, 4L, 5L), stpoz = c(2L, 2L, 2L, 4L, 6L, 8L, 10L
), aennr = c(NA, NA, NA, NA, NA, NA, NA), vgknt = c(0L, 0L, 0L,
0L, 0L, 0L, 0L), idnrk = c("test_1", "test_1", "test_2", "test_3",
"test_3", "test_1", "test_2"), pswrk = c(NA, NA, NA, NA, NA,
NA, NA), meins = c("EA", "EA", "EA", "EA", "EA", "EA", "EA"),
menge = c(1, 14, 4, 4, 2, 2, 1), fmeng = c(NA, NA, NA, NA,
NA, NA, NA), ausch = c(0, 0, 0, 0, 0, 0, 0), avoau = c(0,
0, 0, 0, 0, 0, 0), netau = c(NA, NA, NA, NA, NA, NA, NA),
erskz = c(NA, NA, NA, NA, NA, NA, NA), rekri = c(NA, NA,
NA, NA, NA, NA, NA), rekrs = c(NA, NA, NA, NA, NA, NA, NA
), nlfzt = c(0L, 0L, 0L, 0L, 0L, 0L, 0L), verti = c(NA, NA,
NA, NA, NA, NA, NA), alpos = c(NA, NA, NA, NA, NA, NA, NA
), ewahr = c(0L, 0L, 0L, 0L, 0L, 0L, 0L), ekgrp = c(NA, NA,
NA, NA, NA, NA, NA), lifzt = c(0L, 0L, 0L, 0L, 0L, 0L, 0L
), lifnr = c(NA, NA, NA, NA, NA, NA, NA), roms1 = c(0, 0,
0, 0, 0, 0, 0), roms2 = c(0, 0, 0, 0, 0, 0, 0), roms3 = c(0,
0, 0, 0, 0, 0, 0), romen = c(0, 0, 0, 0, 0, 0, 0), rform = c(NA,
NA, NA, NA, NA, NA, NA), upskz = c(NA, NA, NA, NA, NA, NA,
NA), valkz = c(NA, NA, NA, NA, NA, NA, NA), matkl = c(NA,
NA, NA, NA, NA, NA, NA), webaz = c(0L, 0L, 0L, 0L, 0L, 0L,
0L), clobk = c(NA, NA, NA, NA, NA, NA, NA), lgort = c(NA,
NA, NA, NA, NA, NA, 14L), kzkup = c(NA, NA, NA, NA, NA, NA,
NA), dvnam = c(NA, NA, NA, NA, NA, NA, NA), dspst = c(NA,
NA, NA, NA, NA, NA, NA), alpst = c(NA, NA, NA, NA, NA, NA,
NA), alprf = c(0L, 0L, 0L, 0L, 0L, 0L, 0L), alpgr = c(NA,
NA, NA, NA, NA, NA, NA), kstty = c(NA, NA, NA, NA, NA, NA,
NA), kstnr = c(NA, NA, NA, NA, NA, NA, NA), nlfzv = c(0L,
0L, 0L, 0L, 0L, 0L, 0L), nlfmv = c(NA, NA, NA, NA, NA, NA,
NA), idhis = c(0L, 0L, 0L, 0L, 0L, 0L, 0L), idvar = c(NA,
NA, NA, NA, NA, NA, NA), itsob = c(NA, NA, NA, NA, NA, NA,
NA), cufactor = c(0L, 0L, 0L, 0L, 0L, 0L, 0L), funcid = c(NA,
NA, NA, NA, NA, NA, NA)), row.names = c(NA, -7L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000022534c51ef0>)
The issue is that we are subsetting a data.table rather than a data.frame. Here, we need with = FALSE (as mentioned in ?data.table):
j - When with=TRUE (default), j is evaluated within the frame of the data.table; i.e., it sees column names as if they are variables.
stpo[,c(names(stpo))]
[1] "stlnr" "stlkn" "stpoz" "aennr" "vgknt" "idnrk" "pswrk" "meins" "menge" "fmeng" "ausch" "avoau" "netau" "erskz"
[15] "rekri" "rekrs" "nlfzt" "verti" "alpos" "ewahr" "ekgrp" "lifzt" "lifnr" "roms1" "roms2" "roms3" "romen" "rform"
[29] "upskz" "valkz" "matkl" "webaz" "clobk" "lgort" "kzkup" "dvnam" "dspst" "alpst" "alprf" "alpgr" "kstty" "kstnr"
[43] "nlfzv" "nlfmv" "idhis" "idvar" "itsob" "cufactor" "funcid"
Now, check the output of
stpo[,c(names(stpo)), with = FALSE]
stlnr stlkn stpoz aennr vgknt idnrk pswrk meins menge fmeng ausch avoau netau erskz rekri rekrs nlfzt verti alpos ewahr ekgrp lifzt lifnr roms1 roms2
1: 1 1 2 NA 0 test_1 NA EA 1 NA 0 0 NA NA NA NA 0 NA NA 0 NA 0 NA 0 0
2: 2 1 2 NA 0 test_1 NA EA 14 NA 0 0 NA NA NA NA 0 NA NA 0 NA 0 NA 0 0
3: 3 1 2 NA 0 test_2 NA EA 4 NA 0 0 NA NA NA NA 0 NA NA 0 NA 0 NA 0 0
4: 3 2 4 NA 0 test_3 NA EA 4 NA 0 0 NA NA NA NA 0 NA NA 0 NA 0 NA 0 0
5: 3 3 6 NA 0 test_3 NA EA 2 NA 0 0 NA NA NA NA 0 NA NA 0 NA 0 NA 0 0
6: 3 4 8 NA 0 test_1 NA EA 2 NA 0 0 NA NA NA NA 0 NA NA 0 NA 0 NA 0 0
7: 4 5 10 NA 0 test_2 NA EA 1 NA 0 0 NA NA NA NA 0 NA NA 0 NA 0 NA 0 0
roms3 romen rform upskz valkz matkl webaz clobk lgort kzkup dvnam dspst alpst alprf alpgr kstty kstnr nlfzv nlfmv idhis idvar itsob cufactor funcid
1: 0 0 NA NA NA NA 0 NA NA NA NA NA NA 0 NA NA NA 0 NA 0 NA NA 0 NA
2: 0 0 NA NA NA NA 0 NA NA NA NA NA NA 0 NA NA NA 0 NA 0 NA NA 0 NA
3: 0 0 NA NA NA NA 0 NA NA NA NA NA NA 0 NA NA NA 0 NA 0 NA NA 0 NA
4: 0 0 NA NA NA NA 0 NA NA NA NA NA NA 0 NA NA NA 0 NA 0 NA NA 0 NA
5: 0 0 NA NA NA NA 0 NA NA NA NA NA NA 0 NA NA NA 0 NA 0 NA NA 0 NA
6: 0 0 NA NA NA NA 0 NA NA NA NA NA NA 0 NA NA NA 0 NA 0 NA NA 0 NA
7: 0 0 NA NA NA NA 0 NA 14 NA NA NA NA 0 NA NA NA 0 NA 0 NA
Also, there is no need to do any subsetting if the whole columns are used, i.e. simply do
purrr::map(stpo, test_unique)
-output
$stlnr
[1] 4
$stlkn
[1] 5
$stpoz
[1] 5
...
...
Regarding the use of
stpo[,1:length(names(stpo))]
It seems to be a bug or a hackish way of dealing with things, rather than the standard option.
If we want to eliminate columns having a single value, use var (assuming all numeric columns)
Filter(var, stpo)
stlnr stlkn stpoz menge
1: 1 1 2 1
2: 2 1 2 14
3: 3 1 2 4
4: 3 2 4 4
5: 3 3 6 2
6: 3 4 8 2
7: 4 5 10 1
Or change the function to return a logical output (it will also check for other type columns)
# TRUE when x holds more than one distinct value (NA counts as a value).
f1 <- function(x) {
  n_values <- length(unique(x))
  n_values > 1
}
Filter(f1, stpo)
-output
stlnr stlkn stpoz idnrk menge lgort
1: 1 1 2 test_1 1 NA
2: 2 1 2 test_1 14 NA
3: 3 1 2 test_2 4 NA
4: 3 2 4 test_3 4 NA
5: 3 3 6 test_3 2 NA
6: 3 4 8 test_1 2 NA
7: 4 5 10 test_2 1 14
Or use the data.table way of subsetting the columns
stpo[, .SD, .SDcols = f1]
stlnr stlkn stpoz idnrk menge lgort
1: 1 1 2 test_1 1 NA
2: 2 1 2 test_1 14 NA
3: 3 1 2 test_2 4 NA
4: 3 2 4 test_3 4 NA
5: 3 3 6 test_3 2 NA
6: 3 4 8 test_1 2 NA
7: 4 5 10 test_2 1 14
Looks like I have taken a cue from what Arun wrote and modified the code like so:
res2 <- map(stpo[,1:length(names(stpo))], test_unique)

How to remove NA in character data in R

I would like to copy the last two columns from each month to the beginning of the next month. I did it as follows (below), but the data contains NA and when I change it to character, the program breaks down. How do I copy columns to keep their type?
My code:
library(readxl)
library(tibble)
df<- read_excel("C:/Users/Rezerwa/Documents/Database.xlsx")
df=add_column(df, Feb1 = as.character(do.call(paste0, df["January...4"])), .after = "January...5")
df=add_column(df, Feb2 = as.numeric(do.call(paste0, df["January...5"])), .after = "Feb1")
My data:
df
# A tibble: 10 x 13
Product January...2 January...3 January...4 January...5 February...6 February...7 February...8 February...9 March...10 March...11 March...12 March...13
<chr> <lgl> <lgl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl>
1 a NA NA 754.00 4 754.00 4 754.00 4 754.00 4 754.00 4
2 b NA NA 706.00 3 706.00 3 706.00 3 706.00 3 706.00 3
3 c NA NA 517.00 3 517.00 3 517.00 3 517.00 3 517.00 3
4 d NA NA 1466.00 9 1466.00 9 1466.00 9 1466.00 9 1466.00 9
5 e NA NA 543.00 8 543.00 8 543.00 8 543.00 8 543.00 8
6 f NA NA NA NA NA NA NA NA NA NA NA NA
7 g NA NA NA NA NA NA NA NA NA NA NA NA
8 h NA NA NA NA NA NA NA NA NA NA NA NA
9 i NA NA 1466.00 8 NA NA NA NA NA NA NA NA
10 j NA NA NA NA 543.00 3 NA NA NA NA NA NA
My error:
> df=add_column(df, Feb1 = as.character(do.call(paste0, df["January...4"])), .after = "January...5")
> df=add_column(df, Feb2 = as.numeric(do.call(paste0, df["January...5"])), .after = "Feb1")
Warning message:
In eval_tidy(xs[[i]], unique_output) : NAs introduced by coercion
Using base R we can split the columns based on the prefix of their names, select last two columns from each group and cbind to original df.
# Split every column except the first into groups by the part of its name
# before the first "." (the month), take the last two columns of each group,
# bind those picks together, and append them to the right of df.
df1 <- cbind(df, do.call(cbind, lapply(split.default(df[-1],
sub("\\..*", "", names(df)[-1])), function(x) {n <- ncol(x);x[, c(n-1, n)]})))
To get data in order, we can do
# Keep the first column in place and reorder the remaining columns by
# calendar month: the part of each name before the first "." is looked up in
# month.name and the columns are sorted by that position.
cbind(df1[1], df1[-1][order(match(sub("\\..*", "", names(df1)[-1]), month.name))])
data
df <- structure(list(Product = structure(1:10, .Label = c("a", "b",
"c", "d", "e", "f", "g", "h", "i", "j"), class = "factor"), January...2 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), January...3 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), January...4 = c(754, 706, 517,
1466, 543, NA, NA, NA, 1466, NA), January...5 = c(4L, 3L, 3L,
9L, 8L, NA, NA, NA, 8L, NA), February...6 = c(754, 706, 517,
1466, 543, NA, NA, NA, NA, 543), February...7 = c(4L, 3L, 3L,
9L, 8L, NA, NA, NA, NA, 3L), February...8 = c(754, 706, 517,
1466, 543, NA, NA, NA, NA, NA), February...9 = c(4L, 3L, 3L,
9L, 8L, NA, NA, NA, NA, NA), March...10 = c(754, 706, 517, 1466,
543, NA, NA, NA, NA, NA), March...11 = c(4L, 3L, 3L, 9L, 8L,
NA, NA, NA, NA, NA), March...12 = c(754, 706, 517, 1466, 543,
NA, NA, NA, NA, NA), March...13 = c(4L, 3L, 3L, 9L, 8L, NA, NA,
NA, NA, NA)), class = "data.frame", row.names = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10"))

divide max value in col by sum of values in row

I have a matrix and my objective is to find the maximum of each column and then to divide that number by the sum of all values in the row which contains the max of that column. In other words
max(y) / sum of values in the row where y is the max
How would I apply this formula to every column in R?
> the_matrix
Source: local data frame [20 x 10]
type 100 100F 100I 100X 101 102 1028P 103 103D
(fctr) (int) (int) (int) (int) (int) (int) (int) (int) (int)
1 0 NA NA NA NA NA NA NA NA NA
2 0A 2 NA NA NA NA NA NA NA NA
3 0B NA NA NA NA NA NA NA NA NA
4 0C NA NA NA NA NA NA NA NA NA
5 0E NA NA NA NA NA NA NA NA NA
6 0G NA NA NA NA NA NA NA NA NA
7 0O NA NA NA NA NA NA NA NA NA
8 0Z NA NA NA NA NA NA NA NA NA
9 1 2 NA NA NA NA NA NA NA NA
10 1A 3968 NA 214 26 4 289 8 56030 7484
11 1B 172 NA 107 NA NA 2 NA 372 3829
12 1C 584 NA 19 NA NA 1 NA 72951 363
13 1D 27 NA NA NA NA NA NA 365 22
14 1E 27944 16 68 NA NA NA 1 62 12
15 1F 1 NA 1 NA NA 1 NA 368 27
16 1G 4 NA NA NA NA NA NA 7 NA
17 1H 65 NA 6 21 1 6 3 714 59
18 1M NA NA NA NA NA NA NA 1 NA
19 1N NA NA NA NA NA NA NA NA NA
20 1Q NA NA NA NA NA NA NA NA NA
> dput(the_matrix)
structure(list(type = structure(1:20, .Label = c("0", "0A", "0B",
"0C", "0E", "0G", "0O", "0Z", "1", "1A", "1B", "1C", "1D", "1E",
"1F", "1G", "1H", "1M", "1N", "1Q", "1S", "1X", "1Z", "2", "2A",
"2B", "2C", "2D", "2E", "2F", "2G", "2H", "2I", "2J", "2M", "2S",
"2T", "2X", "2Z", "3", "3B", "3C", "3E", "4B", "5H", "8Z", "0H",
"1I", "1R", "2N", "3H", "5D", "0D", "1K", "1P", "1T", "1U", "1V",
"1W", "1Y", "2U", "3A", "4A", "5C", "7H", "9", "0F", "0T", "1J",
"2L", "0W", "2Q", "3G"), class = "factor"), `100` = c(NA, 2L,
NA, NA, NA, NA, NA, NA, 2L, 3968L, 172L, 584L, 27L, 27944L, 1L,
4L, 65L, NA, NA, NA), `100F` = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 16L, NA, NA, NA, NA, NA, NA), `100I` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, 214L, 107L, 19L, NA, 68L, 1L,
NA, 6L, NA, NA, NA), `100X` = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, 26L, NA, NA, NA, NA, NA, NA, 21L, NA, NA, NA), `101` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, 4L, NA, NA, NA, NA, NA, NA, 1L,
NA, NA, NA), `102` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 289L,
2L, 1L, NA, NA, 1L, NA, 6L, NA, NA, NA), `1028P` = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, 8L, NA, NA, NA, 1L, NA, NA, 3L, NA,
NA, NA), `103` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 56030L,
372L, 72951L, 365L, 62L, 368L, 7L, 714L, 1L, NA, NA), `103D` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, 7484L, 3829L, 363L, 22L, 12L,
27L, NA, 59L, NA, NA, NA)), .Names = c("type", "100", "100F",
"100I", "100X", "101", "102", "1028P", "103", "103D"), class = c("tbl_df",
"data.frame"), row.names = c(NA, -20L))
Going step-by-step:
# let's not call a data frame a matrix
real_matrix = as.matrix(the_matrix[, -1])
# max of each column
col_max = apply(real_matrix, 2, max, na.rm = T)
# which row contains the max
col_which_max = apply(real_matrix, 2, which.max)
# row totals
row_total = rowSums(real_matrix, na.rm = T)
# col max divided by row total for corresponding row
col_max / row_total[col_which_max]
Rounded to 3 decimals, this yields the following:
100 100F 100I 100X 101 102 1028P 103 103D
0.994 0.001 0.003 0.000 0.000 0.004 0.000 0.987 0.110

Breaking the tapply junkie habit

I've learned R by toying, and I'm starting to think that I'm abusing the tapply function. Are there better ways to do some of the following actions? Granted, they work, but as they get more complex I wonder if I'm losing out on better options. I'm looking for some criticism, here:
tapply(var1, list(fac1, fac2), mean, na.rm=T)
tapply(var1, fac1, sum, na.rm=T) / tapply(var2, fac1, sum, na.rm=T)
cumsum(tapply(var1, fac1, sum, na.rm=T)) / sum(var1)
Update: Here's some example data...
var1 var2 fac1 fac2
1 NA 275.54 10 (266,326]
2 NA 565.89 10 (552,818]
3 NA 815.41 6 (552,818]
4 NA 281.77 6 (266,326]
5 NA 640.24 NA (552,818]
6 NA 78.42 NA [78.4,266]
7 NA 1027.06 NA (818,1.55e+03]
8 NA 355.20 NA (326,552]
9 NA 464.52 NA (326,552]
10 NA 1397.11 10 (818,1.55e+03]
11 NA 229.82 NA [78.4,266]
12 NA 542.77 NA (326,552]
13 NA 829.32 NA (818,1.55e+03]
14 NA 284.78 NA (266,326]
15 NA 194.97 10 [78.4,266]
16 NA 672.55 8 (552,818]
17 NA 348.01 10 (326,552]
18 NA 1550.79 9 (818,1.55e+03]
19 101.98 101.98 4 [78.4,266]
20 NA 292.80 6 (266,326]
Update data dump:
structure(list(var1 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 101.98, NA), var2 = c(275.54,
565.89, 815.41, 281.77, 640.24, 78.42, 1027.06, 355.2, 464.52,
1397.11, 229.82, 542.77, 829.32, 284.78, 194.97, 672.55, 348.01,
1550.79, 101.98, 292.8), fac1 = c(10L, 10L, 6L, 6L, NA, NA, NA,
NA, NA, 10L, NA, NA, NA, NA, 10L, 8L, 10L, 9L, 4L, 6L), fac2 = structure(c(2L,
4L, 4L, 2L, 4L, 1L, 5L, 3L, 3L, 5L, 1L, 3L, 5L, 2L, 1L, 4L, 3L,
5L, 1L, 2L), .Label = c("[78.4,266]", "(266,326]", "(326,552]",
"(552,818]", "(818,1.55e+03]"), class = "factor")), .Names = c("var1",
"var2", "fac1", "fac2"), row.names = c(NA, -20L), class = "data.frame")
For part 1 I prefer aggregate because it keeps the data in a more R-like one observation per row format.
# Mean of var1 for every fac1 x fac2 combination, one combination per row
aggregate(x = var1, by = list(fac1, fac2), FUN = mean, na.rm = TRUE)

Resources