replacing NA values using stringr - r

My data looks as follows:
var1 var2 var3
1 9V .6V 77V
2 6V .3V 15V
3 9V .7V 114V
4 12V 1.0V 199V
5 14V 1.2V 245V
6 13V .8V 158V
7 11V .6V 136V
8 11V .7V 132V
9 12V .8V 171V
10 11V .7V 155V
11 13V .8V 166V
12 11V .7V 138V
13 11V .9V 173V
14 9V .8V 143V
15 8V .7V 105V
16 4V .4V 21V
17 8V .4V 26V
18 10V .8V 154V
19 9V .8V 130V
20 10V .7V 113V
21 10V .6V 102V
22 11V .8V 135V
23 9V .7V 120V
24 10V .7V 126V
25 7N .6N 124N
26 14N 1.1N 210N
The last 2 rows contain N. I am trying to set to NA these observations which contain N.
I am trying some combination of str_detect and str_replace but I cannot seem to get it working.
Additionally I have other (very rarely) letters, i.e. M and P - I would like to find a way to set if the observations contains one of these letters, then set that observation to NA. i.e. c(var1:var3) %in% str_detect(c("N", "M", "P"))... str_replace_all.
Data:
structure(list(var1 = c("9V", "6V", "9V", "12V", "14V", "13V",
"11V", "11V", "12V", "11V", "13V", "11V", "11V", "9V", "8V",
"4V", "8V", "10V", "9V", "10V", "10V", "11V", "9V", "10V", "7N",
"14N", "7V", "5V", "7V", "9V", "8V", "8V", "5V", "4V", "4V",
"5V", "7V", "5V", "6V", "8V", "9V", "6V", "6V", "7V", "8V", "7V",
"8V", "8V", "7V", "8V"), var2 = c(".6V", ".3V", ".7V", "1.0V",
"1.2V", ".8V", ".6V", ".7V", ".8V", ".7V", ".8V", ".7V", ".9V",
".8V", ".7V", ".4V", ".4V", ".8V", ".8V", ".7V", ".6V", ".8V",
".7V", ".7V", ".6N", "1.1N", ".4V", ".3V", ".4V", ".6V", ".5V",
".6V", ".4V", ".3V", ".2V", ".3V", ".4V", ".3V", ".3V", ".5V",
".6V", ".4V", ".4V", ".4V", ".5V", ".4V", ".4V", ".5V", ".4V",
".4V"), var3 = c("77V", "15V", "114V", "199V", "245V", "158V",
"136V", "132V", "171V", "155V", "166V", "138V", "173V", "143V",
"105V", "21V", "26V", "154V", "130V", "113V", "102V", "135V",
"120V", "126V", "124N", "210N", "35V", "9V", "48V", "91V", "81V",
"80V", "14V", "11V", "7V", "13V", "34V", "18V", "15V", "58V",
"76V", "29V", "30V", "31V", "32V", "34V", "57V", "58V", "52V",
"49V")), row.names = c(NA, 50L), class = "data.frame")

Here's one solution:
x[] <- lapply(x, function(s) ifelse(grepl("N$", s), NA_character_, s))
x
# var1 var2 var3
# 1 9V .6V 77V
# 2 6V .3V 15V
# 3 9V .7V 114V
# 4 12V 1.0V 199V
# 5 14V 1.2V 245V
# 6 13V .8V 158V
# 7 11V .6V 136V
# 8 11V .7V 132V
# 9 12V .8V 171V
# 10 11V .7V 155V
# 11 13V .8V 166V
# 12 11V .7V 138V
# 13 11V .9V 173V
# 14 9V .8V 143V
# 15 8V .7V 105V
# 16 4V .4V 21V
# 17 8V .4V 26V
# 18 10V .8V 154V
# 19 9V .8V 130V
# 20 10V .7V 113V
# 21 10V .6V 102V
# 22 11V .8V 135V
# 23 9V .7V 120V
# 24 10V .7V 126V
# 25 <NA> <NA> <NA>
# 26 <NA> <NA> <NA>
# 27 7V .4V 35V
# 28 5V .3V 9V
# 29 7V .4V 48V
# 30 9V .6V 91V
# 31 8V .5V 81V
# 32 8V .6V 80V
# 33 5V .4V 14V
# 34 4V .3V 11V
# 35 4V .2V 7V
# 36 5V .3V 13V
# 37 7V .4V 34V
# 38 5V .3V 18V
# 39 6V .3V 15V
# 40 8V .5V 58V
# 41 9V .6V 76V
# 42 6V .4V 29V
# 43 6V .4V 30V
# 44 7V .4V 31V
# 45 8V .5V 32V
# 46 7V .4V 34V
# 47 8V .4V 57V
# 48 8V .5V 58V
# 49 7V .4V 52V
# 50 8V .4V 49V
If your data has columns where you do not want to do this replacement, just use a subset:
x[2:3] <- lapply(x[2:3], ...)
Variant:
library(dplyr)
x %>%
mutate_at(vars(var1, var2, var3), ~ if_else(grepl("N$", .), NA_character_, .))
# or, if all columns
x %>%
mutate_all(~ if_else(grepl("N$", .), NA_character_, .))
The use of NA_character_ is two-fold:
In the base R version, it is just being declarative, saying that I intend for the result to always be character;
In the dplyr version, the `if_else` function requires that the class of both the "yes" and "no" arguments be the same, and class(NA) ("logical") is not class("A") ("character").

You just need to change your pattern to "N|M|P" :
dat <- structure(list(var1 = c("9V", "6V", "9V", "12V", "14V", "13V",
"11V", "11V", "12V", "11V", "13V", "11V", "11V", "9V", "8V",
"4V", "8V", "10V", "9V", "10V", "10V", "11V", "9V", "10V", "7N",
"14N", "7V", "5V", "7V", "9V", "8V", "8V", "5V", "4V", "4V",
"5V", "7V", "5V", "6V", "8V", "9V", "6V", "6V", "7V", "8V", "7V",
"8V", "8V", "7V", "8V"), var2 = c(".6V", ".3V", ".7V", "1.0V",
"1.2V", ".8V", ".6V", ".7V", ".8V", ".7V", ".8V", ".7V", ".9V",
".8V", ".7V", ".4V", ".4V", ".8V", ".8V", ".7V", ".6V", ".8V",
".7V", ".7V", ".6N", "1.1N", ".4V", ".3V", ".4V", ".6V", ".5V",
".6V", ".4V", ".3V", ".2V", ".3V", ".4V", ".3V", ".3V", ".5V",
".6V", ".4V", ".4V", ".4V", ".5V", ".4V", ".4V", ".5V", ".4V",
".4V"), var3 = c("77V", "15V", "114V", "199V", "245V", "158V",
"136V", "132V", "171V", "155V", "166V", "138V", "173V", "143V",
"105V", "21V", "26V", "154V", "130V", "113V", "102V", "135V",
"120V", "126V", "124N", "210N", "35V", "9V", "48V", "91V", "81V",
"80V", "14V", "11V", "7V", "13V", "34V", "18V", "15V", "58V",
"76V", "29V", "30V", "31V", "32V", "34V", "57V", "58V", "52V",
"49V")), row.names = c(NA, 50L), class = "data.frame")
library(stringr)
library(dplyr)
dat %>% mutate(var3 = str_replace_all(var3, c("N|M|P"), replacement = NA_character_))

The dplyr-stringr solution that you were trying to figure out would be like below:
library(stringr)
library(dplyr)
df1 %>%
mutate_at(vars(var1:var3),
list(~str_replace_all(., "N$|M$|P$", replacement = NA_character_)))
#> var1 var2 var3
#> 1 9V .6V 77V
#> 2 6V .3V 15V
#> 3 9V .7V 114V
#> 4 12V 1.0V 199V
#> 5 14V 1.2V 245V
## ...
#> 20 10V .7V 113V
#> 21 10V .6V 102V
#> 22 11V .8V 135V
#> 23 9V .7V 120V
#> 24 10V .7V 126V
#> 25 <NA> <NA> <NA>
#> 26 <NA> <NA> <NA>
#> 27 7V .4V 35V
#> 28 5V .3V 9V
#> 29 7V .4V 48V
#> 30 9V .6V 91V
## ...
#> 45 8V .5V 32V
#> 46 7V .4V 34V
#> 47 8V .4V 57V
#> 48 8V .5V 58V
#> 49 7V .4V 52V
#> 50 8V .4V 49V

Related

How to build a real-time dataframe in R?

Let's say I have two dataframes like the ones below:
df1 = structure(list(Date = c("2000-01-05", "2000-02-03", "2000-03-02",
"2000-03-30", "2000-04-13", "2000-05-11", "2000-06-08", "2000-07-06",
"2000-09-14", "2000-10-19", "2000-11-02", "2000-12-14", "2001-02-01",
"2001-03-01", "2001-04-11", "2001-05-10", "2001-06-07", "2001-06-21",
"2001-07-05", "2001-08-30", "2001-10-11", "2001-11-08", "2001-12-06"
)), row.names = c(NA, 23L), class = "data.frame")
Date
1 2000-01-05
2 2000-02-03
3 2000-03-02
4 2000-03-30
5 2000-04-13
6 2000-05-11
7 2000-06-08
8 2000-07-06
9 2000-09-14
10 2000-10-19
11 2000-11-02
12 2000-12-14
13 2001-02-01
14 2001-03-01
15 2001-04-11
16 2001-05-10
17 2001-06-07
18 2001-06-21
19 2001-07-05
20 2001-08-30
21 2001-10-11
22 2001-11-08
23 2001-12-06
df2 = structure(list(Date = structure(c(10987, 11016, 11047, 11077,
11108, 11138, 11169, 11200, 11230, 11261, 11291, 11322, 11353,
11381, 11412, 11442, 11473, 11503, 11534, 11565, 11595, 11626,
11656, 11687), class = "Date"), x = c(3.04285714285714, 3.27571428571429,
3.5104347826087, 3.685, 3.92, 4.29454545454545, 4.30857142857143,
4.41913043478261, 4.59047619047619, 4.76272727272727, 4.82909090909091,
4.82684210526316, 4.75590909090909, 4.9925, 4.78136363636364,
5.06421052631579, 4.65363636363636, 4.53952380952381, 4.50545454545454,
4.49130434782609, 3.9865, 3.97130434782609, 3.50727272727273,
3.33888888888889)), row.names = c(NA, 24L), class = "data.frame")
Date x
1 2000-01-31 3.042857
2 2000-02-29 3.275714
3 2000-03-31 3.510435
4 2000-04-30 3.685000
5 2000-05-31 3.920000
6 2000-06-30 4.294545
7 2000-07-31 4.308571
8 2000-08-31 4.419130
9 2000-09-30 4.590476
10 2000-10-31 4.762727
11 2000-11-30 4.829091
12 2000-12-31 4.826842
13 2001-01-31 4.755909
14 2001-02-28 4.992500
15 2001-03-31 4.781364
16 2001-04-30 5.064211
17 2001-05-31 4.653636
18 2001-06-30 4.539524
19 2001-07-31 4.505455
20 2001-08-31 4.491304
21 2001-09-30 3.986500
22 2001-10-31 3.971304
23 2001-11-30 3.507273
24 2001-12-31 3.338889
Now, what I would like to do is to create a real-time dataframe, that is, keep only the data in df2 that were available at the time of df1. For instance, at 2000-01-05 (first row in df1) no data in df2 was available, since 2000-01-31 (first row of df2) occurs after 2000-01-05. However, at 2000-02-03 (second row in df1) the observation from 2000-01-31 (first row of df2) is available. This should be the reasoning for every row. The outcome should look like this:
Date y
1 2000-01-05 NA
2 2000-02-03 3.042857
3 2000-03-02 3.275714
4 2000-03-30 3.275714
5 2000-04-13 3.510435
6 2000-05-11 3.685000
....
The rule would be: pick up from df2 only the observation that was available at the time of df1.
Can anyone help me?
Thanks!
What you can do is complete the df2 dates and then join.
library(dplyr)
library(tidyr)
# create a dataframe with all the days, not just the snapshots
df2_complete <- df2 %>%
complete(Date = seq.Date(min(Date), max(Date), by = "day")) %>%
fill(x, .direction = "down")
# convert to Date class for this case and join
df1 %>%
mutate(Date = as.Date(Date)) %>%
left_join(df2_complete, by = "Date")
Which gives:
Date x
1 2000-01-05 NA
2 2000-02-03 3.042857
3 2000-03-02 3.275714
4 2000-03-30 3.275714
5 2000-04-13 3.510435
6 2000-05-11 3.685000
....

Imputing date based on next(or previous) available date grouped by another column

I have a dataframe that looks like this:
CYCLE date_cycle Randomization_Date COUPLEID
1 0 2016-02-16 10892
2 1 2016-08-17 2016-02-19 10894
3 1 2016-08-14 2016-02-26 10899
4 1 2016-02-26 10900
5 2 2016-03--- 2016-02-26 10900
6 3 2016-07-19 2016-02-26 10900
7 4 2016-11-15 2016-02-26 10900
8 1 2016-02-27 10901
9 2 2016-02--- 2016-02-27 10901
10 1 2016-03-27 2016-03-03 10902
11 2 2016-04-21 2016-03-03 10902
12 1 2016-03-03 10903
13 2 2016-03--- 2016-03-03 10903
14 0 2016-03-03 10904
15 1 2016-03-03 10905
16 2 2016-03-03 10905
17 3 2016-03-03 10905
18 4 2016-04-14 2016-03-03 10905
19 5 2016-05--- 2016-03-03 10905
20 6 2016-06--- 2016-03-03 10905
The goal is to fill in the missing day for a given ID using either an earlier or later date and add/subtract 28 from that.
The date_cycle variable was originally in the dataframe as a character type.
I have tried to code it as follows:
mutate(rowwise(df),
newdate = case_when( str_count(date1, pattern = "\\W") >2 ~ lag(as.Date.character(date1, "%Y-%m-%d"),1) + days(28)))
But I need to incorporate it by ID by CYCLE.
An example of my data could be made like this:
data.frame(stringsAsFactors = FALSE,
CYCLE = c(0,1,1,1,2,3,4,1,2,1,2,1,2,0,1,2,3,4,5,6),
date_cycle = c(NA,"2016-08-17", "2016-08-14",NA,"2016-03---","2016-07-19", "2016-11-15",NA,"2016-02---", "2016-03-27","2016-04-21",NA, "2016-03---",NA,NA,NA,NA,"2016-04-14", "2016-05---","2016-06---"), Randomization_Date = c("2016-02-16","2016-02-19",
"2016-02-26","2016-02-26",
"2016-02-26","2016-02-26",
"2016-02-26",
"2016-02-27","2016-02-27",
"2016-03-03",
"2016-03-03","2016-03-03",
"2016-03-03","2016-03-03",
"2016-03-03",
"2016-03-03","2016-03-03",
"2016-03-03",
"2016-03-03","2016-03-03"),
COUPLEID = c(10892,10894,10899,10900,
10900,10900,10900,10901,10901,
10902,10902,10903,10903,10904,
10905,10905,10905,10905,10905,10905)
)
The output I am after would look like:
COUPLEID CYCLE date_cycle new_date_cycle
a 1 2014-03-27 2014-03-27
a 1 2014-04--- 2014-04-24
b 1 2014-03-24 2014-03-24
b 2 2014-04-21
b 3 2014-05--- 2014-05-19
c 1 2014-04--- 2014-04-02
c 2 2014-04-30 2014-04-30
I have also started to make a long conditional, but I wanted to ask here and see if anyone knew of a more straightforward way to do it, instead of explicitly writing out all of the possible conditions.
mutate(rowwise(df),
newdate = case_when(
grp == 1 & str_count(date1, pattern = "\\W") >2 & !is.na(lead(date1,1) ~ lead(date1,1) - days(28),
grp == 2 & str_count(date1, pattern = "\\W") >2 & !is.na(lead(date1,1)) ~ lead(date1,1) - days(28),
grp == 3 & str_count(date1, pattern = "\\W") >2 & ...)))
Function to fill dates forward and backwards
# Fill the NA entries of a Date vector by stepping 28 days from the
# nearest known date: positions before the first non-NA are back-filled
# (each is the following date minus 28 days), and any NAs remaining
# after that are forward-filled (each is the preceding date plus 28).
#
# @param dates A Date vector, possibly containing NAs.
# @return The same vector with NAs imputed. Returned unchanged when the
#   vector is all-NA or has no NAs at all.
filldates <- function(dates) {
  m <- which(!is.na(dates))
  # Only impute when at least one date is known AND at least one is NA.
  # (&& is used: both operands are scalar conditions.)
  if (length(m) > 0 && length(m) != length(dates)) {
    # Backward fill: walk from the first known date down to index 1.
    # BUG FIX: the original used seq(m, 1, -1); `m` is a vector from
    # which(), and seq() errors with "'from' must be of length 1" as
    # soon as two or more dates are non-NA. Only the first known
    # position is needed here.
    if (m[1] > 1) {
      for (i in seq(m[1], 1, -1)) {
        if (is.na(dates[i])) dates[i] <- dates[i + 1] - 28
      }
    }
    # Forward fill: every NA left after the backward pass has a known
    # (or already-imputed) predecessor, so add 28 days to it.
    if (anyNA(dates)) {
      for (i in seq_along(dates)) {
        if (is.na(dates[i])) dates[i] <- dates[i - 1] + 28
      }
    }
  }
  return(dates)
}
Usage:
data %>%
arrange(ID, grp) %>%
group_by(ID) %>%
mutate(date2=filldates(as.Date(date1,"%Y-%m-%d")))
Output:
ID grp date1 date2
<chr> <dbl> <chr> <date>
1 a 1 2014-03-27 2014-03-27
2 a 2 2014-04--- 2014-04-24
3 b 1 2014-03-24 2014-03-24
4 b 2 2014-04--- 2014-04-21
5 b 3 2014-05--- 2014-05-19
6 c 1 2014-03--- 2014-04-02
7 c 2 2014-04-30 2014-04-30
An option using purrr::accumulate().
library(tidyverse)
center <- df %>%
group_by(ID) %>%
mutate(helpDate = ymd(str_replace(date1, '---', '-01')),
refDate = max(ymd(date1), na.rm = T))
backward <- center %>%
filter(refDate == max(helpDate)) %>%
mutate(date2 = accumulate(refDate, ~ . - days(28), .dir = 'backward'))
forward <- center %>%
filter(refDate == min(helpDate)) %>%
mutate(date2 = accumulate(refDate, ~ . + days(28)))
bind_rows(forward, backward) %>%
ungroup() %>%
mutate(date2 = as_date(date2)) %>%
select(-c('helpDate', 'refDate'))
# # A tibble: 7 x 4
# ID grp date1 date2
# <chr> <int> <chr> <date>
# 1 a 1 2014-03-27 2014-03-27
# 2 a 2 2014-04--- 2014-04-24
# 3 b 1 2014-03-24 2014-03-24
# 4 b 2 2014-04--- 2014-04-21
# 5 b 3 2014-05--- 2014-05-19
# 6 c 1 2014-03--- 2014-04-02
# 7 c 2 2014-04-30 2014-04-30

Mean function R with missing values, loop on 5 rows

I would like to calculate mean every 5 rows in my df. Here is my df :
Time
value
03/06/2021 06:15:00
NA
03/06/2021 06:16:00
NA
03/06/2021 06:17:00
20
03/06/2021 06:18:00
22
03/06/2021 06:19:00
25
03/06/2021 06:20:00
NA
03/06/2021 06:21:00
31
03/06/2021 06:22:00
23
03/06/2021 06:23:00
19
03/06/2021 06:24:00
25
03/06/2021 06:25:00
34
03/06/2021 06:26:00
42
03/06/2021 06:27:00
NA
03/06/2021 06:28:00
19
03/06/2021 06:29:00
17
03/06/2021 06:30:00
25
I already have a loop which goes well to calculate means for each 5 rows package. My problem is in my "mean function".
The problem is :
-if I put na.rm = FALSE, mean = NA as soon as there is a NA in a package of 5 values.
- if I put na.rm = TRUE in mean function, the result gives me averages that are shifted to take 5 values. I would like the NA not to interfere with the average and that when there is a NA in a package of 5 values, the average is only done on 4 values.
How can I do this? Thanks for your help !
You can solve your problem by introducing a dummy variable that groups your observarions in sets of five and then calculating the mean within group. Here's MWE, based in the tidyverse, that assumes your data is in a data.frame named df.
library(tidyverse)
df %>%
mutate(Group= 1 + floor((row_number()-1) / 5)) %>%
group_by(Group) %>%
summarise(Mean=mean(value, na.rm=TRUE), .groups="drop")
# A tibble: 4 × 2
Group Mean
<dbl> <dbl>
1 1 22.3
2 2 24.5
3 3 28
4 4 25
A solution based on purrr::map_dfr:
library(purrr)
df <- data.frame(
stringsAsFactors = FALSE,
time = c("03/06/2021 06:15:00","03/06/2021 06:16:00",
"03/06/2021 06:17:00",
"03/06/2021 06:18:00","03/06/2021 06:19:00",
"03/06/2021 06:20:00","03/06/2021 06:21:00",
"03/06/2021 06:22:00","03/06/2021 06:23:00",
"03/06/2021 06:24:00","03/06/2021 06:25:00",
"03/06/2021 06:26:00",
"03/06/2021 06:27:00","03/06/2021 06:28:00",
"03/06/2021 06:29:00","03/06/2021 06:30:00"),
value = c(NA,NA,20L,22L,
25L,NA,31L,23L,19L,25L,34L,42L,NA,19L,17L,
25L)
)
map_dfr(1:(nrow(df)-5),
~ data.frame(Group =.x, Mean = mean(df$value[.x:(.x+5)],na.rm=T)))
#> Group Mean
#> 1 1 22.33333
#> 2 2 24.50000
#> 3 3 24.20000
#> 4 4 24.00000
#> 5 5 24.60000
#> 6 6 26.40000
#> 7 7 29.00000
#> 8 8 28.60000
#> 9 9 27.80000
#> 10 10 27.40000
#> 11 11 27.40000
If you want to take average of every 5 minutes you may use lubridate's function floor_date/ceiling_date to round the time.
library(dplyr)
library(lubridate)
df %>%
mutate(time = mdy_hms(time),
time = floor_date(time, '5 mins')) %>%
group_by(time) %>%
summarise(value = mean(value, na.rm = TRUE))
# time value
# <dttm> <dbl>
#1 2021-03-06 06:15:00 22.3
#2 2021-03-06 06:20:00 24.5
#3 2021-03-06 06:25:00 28
#4 2021-03-06 06:30:00 25

Cannot filter column when name of that column comes from variable

As default I set the argument cut.points as NA and if it's on default then it shouldn't do anything with the data.
But if user decides to put for example cut.points = c("2012-01-01", "2013-01-01") then the data should be filtered by the column that has dates in it. And it should return only dates between 2012 to 2013.
The problem is that I'm reading data inside the function, so in theory I won't know the name of the date column that the user provides. So I find the column with dates and store its name in a variable.
But the condition which I wrote that should filter based on this variable doesn't work:
# NOTE(review): this is the non-working version from the question, kept
# as-is for reference. The defect is in the filter() call below.
modifier <- function(input.data, cut.points = c(NA, NA)) {
# TRUE for each column whose values parse as "%Y-%m-%d" dates
date_check <- sapply(input.data, function(x) !all(is.na(as.Date(as.character(x),format="%Y-%m-%d"))))
# missing() is TRUE only when the caller omitted cut.points entirely
if (missing(cut.points)) {
input.data
} else {
# `cols` holds the *name(s)* of the date column(s) as a character vector
cols <- colnames(select_if(input.data, date_check == TRUE))
cut.points <- as.Date(cut.points)
# BUG: `cols` is the column-name string (e.g. "Order.Date"), so this
# compares the name itself against Dates instead of the column's values,
# producing "character string is not in a standard unambiguous format".
# The column values must be referenced, e.g. via .data[[cols]].
input.data <- filter(input.data, cols > cut.points[1] & cols < cut.points[2])
}
}
for ex. when i try to run this:
modifier(ex_data, cut.points = c("2012-01-01", "2013-01-01"))
On sample like this:
ex_data
Row.ID Order.ID Order.Date
1 32298 CA-2012-124891 2012-07-31
2 26341 IN-2013-77878 2013-02-05
3 25330 IN-2013-71249 2013-10-17
4 13524 ES-2013-1579342 2013-01-28
5 47221 SG-2013-4320 2013-11-05
6 22732 IN-2013-42360 2013-06-28
7 30570 IN-2011-81826 2011-11-07
8 31192 IN-2012-86369 2012-04-14
9 40155 CA-2014-135909 2014-10-14
10 40936 CA-2012-116638 2012-01-28
11 34577 CA-2011-102988 2011-04-05
12 28879 ID-2012-28402 2012-04-19
13 45794 SA-2011-1830 2011-12-27
14 4132 MX-2012-130015 2012-11-13
15 27704 IN-2013-73951 2013-06-06
16 13779 ES-2014-5099955 2014-07-31
17 36178 CA-2014-143567 2014-11-03
18 12069 ES-2014-1651774 2014-09-08
19 22096 IN-2014-11763 2014-01-31
20 49463 TZ-2014-8190 2014-12-05
the error is:
character string is not in a standard unambiguous format
I've added lubridateas a dependency so I could get access to %within% and is.Date. I've also changed the check condition, because I don't think your original one would work with NA, NA.
library(tidyverse)
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
ex_data <- read_table(" Row.ID Order.ID Order.Date
1 32298 CA-2012-124891 2012-07-31
2 26341 IN-2013-77878 2013-02-05
3 25330 IN-2013-71249 2013-10-17
4 13524 ES-2013-1579342 2013-01-28
5 47221 SG-2013-4320 2013-11-05
6 22732 IN-2013-42360 2013-06-28
7 30570 IN-2011-81826 2011-11-07
8 31192 IN-2012-86369 2012-04-14
9 40155 CA-2014-135909 2014-10-14
10 40936 CA-2012-116638 2012-01-28
11 34577 CA-2011-102988 2011-04-05
12 28879 ID-2012-28402 2012-04-19
13 45794 SA-2011-1830 2011-12-27
14 4132 MX-2012-130015 2012-11-13
15 27704 IN-2013-73951 2013-06-06
16 13779 ES-2014-5099955 2014-07-31
17 36178 CA-2014-143567 2014-11-03
18 12069 ES-2014-1651774 2014-09-08
19 22096 IN-2014-11763 2014-01-31
20 49463 TZ-2014-8190 2014-12-05")
#> Warning: Missing column names filled in: 'X1' [1]
# Filter `input.data` to the rows whose auto-detected date column lies
# within the interval [cut.points[1], cut.points[2]].
#
# @param input.data A data frame containing one column of class Date.
# @param cut.points NULL (default: return the data untouched) or a
#   length-2 vector of interval bounds coercible to dates.
# @return The filtered data frame, or `input.data` unchanged when
#   cut.points does not have length 2.
modifier <- function(input.data, cut.points = NULL) {
# length(NULL) is 0, so the default safely skips filtering
if (length(cut.points) == 2) {
# name of the column already stored as class Date
# NOTE(review): assumes exactly one Date-class column; sapply would
# yield several names if multiple Date columns exist -- confirm.
date_col <- colnames(input.data)[sapply(input.data, is.Date)]
filtered.data <- input.data %>%
# temporarily rename to a fixed name so filter() can reference it
rename(Date = !! date_col) %>%
# keep rows whose date falls within the lubridate interval
filter(Date %within% interval(cut.points[1], cut.points[2])) %>%
# restore the caller's original column name
rename_with(~ date_col, Date)
return(filtered.data)
} else {
input.data
}
}
modifier(ex_data, cut.points = c("2012-01-01", "2013-01-01"))
#> # A tibble: 5 x 4
#> X1 Row.ID Order.ID Order.Date
#> <dbl> <dbl> <chr> <date>
#> 1 1 32298 CA-2012-124891 2012-07-31
#> 2 8 31192 IN-2012-86369 2012-04-14
#> 3 10 40936 CA-2012-116638 2012-01-28
#> 4 12 28879 ID-2012-28402 2012-04-19
#> 5 14 4132 MX-2012-130015 2012-11-13

how do you make a sequence using along.with for unique values in r

Lets suppose I have a vector of numeric values
[1] 2844 4936 4936 4972 5078 6684 6689 7264 7264 7880 8133 9018 9968 9968 10247
[16] 11267 11508 11541 11607 11717 12349 12349 12364 12651 13025 13086 13257 13427 13427 13442
[31] 13442 13442 13442 14142 14341 14429 14429 14429 14538 14872 15002 15064 15163 15163 15324
[46] 15324 15361 15361 15400 15624 15648 15648 15648 15864 15864 15881 16332 16847 17075 17136
[61] 17136 17196 17843 17925 17925 18217 18455 18578 18578 18742 18773 18806 19130 19195 19254
[76] 19254 19421 19421 19429 19585 19686 19729 19729 19760 19760 19901 20530 20530 20530 20581
[91] 20629 20629 20686 20693 20768 20902 20980 21054 21079 21156
and I want to create a sequence along this vector but for unique numbers. for example
length(unique(vector))
is 74 and there are a total of 100 values in the vector. The sequence should have numbers ranging from 1 - 74 only but with length 100 as some numbers will be repeated.
Any idea on how this can be done?
Thanks.
Perhaps
res <- as.numeric(factor(v1))
head(res)
#[1] 1 2 2 3 4 5
Or
res1 <- match(v1, unique(v1))
Or
library(fastmatch)
res2 <- fmatch(v1, unique(v1))
Or
res3 <- findInterval(v1, unique(v1))
data
v1 <- c(2844, 4936, 4936, 4972, 5078, 6684, 6689, 7264, 7264, 7880,
8133, 9018, 9968, 9968, 10247, 11267, 11508, 11541, 11607, 11717,
12349, 12349, 12364, 12651, 13025, 13086, 13257, 13427, 13427,
13442, 13442, 13442, 13442, 14142, 14341, 14429, 14429, 14429,
14538, 14872, 15002, 15064, 15163, 15163, 15324, 15324, 15361,
15361, 15400, 15624, 15648, 15648, 15648, 15864, 15864, 15881,
16332, 16847, 17075, 17136, 17136, 17196, 17843, 17925, 17925,
18217, 18455, 18578, 18578, 18742, 18773, 18806, 19130, 19195,
19254, 19254, 19421, 19421, 19429, 19585, 19686, 19729, 19729,
19760, 19760, 19901, 20530, 20530, 20530, 20581, 20629, 20629,
20686, 20693, 20768, 20902, 20980, 21054, 21079, 21156)
You could use .GRP from "data.table" for this:
library(data.table)
y <- as.data.table(x)[, y := .GRP, by = x]
head(y)
# x y
# 1: 2844 1
# 2: 4936 2 ## Note the duplicated value
# 3: 4936 2 ## in these rows, corresponding to x
# 4: 4972 3
# 5: 5078 4
# 6: 6684 5
tail(y)
# x y
# 1: 20768 69
# 2: 20902 70
# 3: 20980 71
# 4: 21054 72
# 5: 21079 73
# 6: 21156 74 ## "y" values go to 74

Resources