It's best explained with an example.
I have a vector, or column from data.frame named vec:
vec <- c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA)
I would like a vectorized process (not a for loop) that replaces the three NAs following each observed 1 with 1.
The end vector would be:
c(NA, NA, 1, 1, 1, 1, NA, 1, 1, 1, 1, NA, NA, NA)
If we had:
vec <- c(NA, NA, 1, NA, 1, NA, NA, 1, NA, NA, NA, NA, NA, NA)
The end vector would look like:
c(NA, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, NA, NA)
A very badly written solution is:
# Loop-based reference solution: vec2[i] becomes 1 whenever vec[i] or any of
# the (up to) three elements before it is not NA.
vec2 <- vec
for (i in seq_along(vec)) {
  # Clamp the look-back window at the first element so that positions 1-3
  # only inspect the elements that actually exist.
  window <- vec[max(1, i - 3):i]
  if (any(!is.na(window))) vec2[i] <- 1
}
Another option:
`[<-`(vec,c(outer(which(vec==1),1:3,"+")),1)
# [1] NA NA 1 1 1 1 NA 1 1 1 1 NA NA NA
Although the above works with the examples, it stretches the length of vec if a 1 is found in the last positions. Better to make a simple check and wrap into a function:
# Set the `n` positions following every 1 in `vec` to 1.
#
# vec: vector of 1s and NAs.
# n:   number of positions to fill after each 1 (default 3).
#
# Returns `vec` with the same length: target positions past the end are
# dropped, so a 1 near the end never lengthens the vector.
threeNAs <- function(vec, n = 3L) {
  hits <- which(vec == 1)
  # seq_len(n) is recycled against each hit repeated n times, giving
  # hit + 1, ..., hit + n for every hit.
  ind <- rep(hits, each = n) + seq_len(n)
  ind <- ind[ind <= length(vec)]
  vec[ind] <- 1
  vec
}
Another fast solution:
# Offsets 1:3 are recycled against each hit (repeated three times) to give the
# three positions after every 1. Indices past the end are dropped so that
# `vec` is not silently lengthened when a 1 sits in the last three positions.
idx <- rep(which(vec == 1), each = 3) + 1:3
vec[idx[idx <= length(vec)]] <- 1
which gives:
> vec
[1] NA NA 1 1 1 1 NA 1 1 1 1 NA NA NA
Benchmarking is only really useful when done on larger datasets. Here is a benchmark of the several posted solutions on a vector 10,000 times longer:
library(microbenchmark)
# Times six posted solutions. Every expression rebuilds the 140,000-element
# test vector (the 14-element example repeated 1e4 times) before applying its
# solution, so vector construction is included in each timing.
microbenchmark(ans.jaap = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
vec[rep(which(vec == 1), each = 3) + c(1:3)] <- 1},
ans.989 = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
r <- which(vec==1);
vec[c(mapply(seq, r, r+3))] <- 1},
ans.sotos = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
vec[unique(as.vector(t(sapply(which(vec == 1), function(i) seq(i+1, length.out = 3)))))] <- 1},
ans.gregor = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
vec[is.na(vec)] <- 0;
n <- length(vec);
vec <- vec + c(0, vec[1:(n-1)]) + c(0, 0, vec[1:(n-2)]) + c(0, 0, 0, vec[1:(n-3)]);
vec[vec == 0] <- NA},
ans.moody = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
output <- sapply(1:length(vec),function(i){any(!is.na(vec[max(0,i-3):i]))});
output[output] <- 1;
output[output==0] <- NA},
ans.nicola = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
`[<-`(vec,c(outer(which(vec==1),1:3,"+")),1)})
which gives the following benchmark:
Unit: microseconds
expr min lq mean median uq max neval cld
ans.jaap 1778.905 1937.414 3064.686 2100.595 2257.695 86233.593 100 a
ans.989 87688.166 89638.133 96992.231 90986.269 93326.393 182431.366 100 c
ans.sotos 125344.157 127968.113 132386.664 130117.438 132951.380 214460.174 100 d
ans.gregor 4036.642 5824.474 10861.373 6533.791 7654.587 87806.955 100 b
ans.moody 173146.810 178369.220 183698.670 180318.799 184000.062 264892.878 100 e
ans.nicola 966.927 1390.486 1723.395 1604.037 1904.695 3310.203 100 a
What really is 'vectorised', if not a loop written in a C-language?
Here's a C++ loop that benchmarks well.
vec <- c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA)
library(Rcpp)
cppFunction('NumericVector fixVec(NumericVector myVec){
  // Returns a copy of myVec in which NA entries following a 1 are set to 1,
  // at most three per 1.
  // Work on a clone: NumericVector wraps the R vector without copying, so
  // writing into myVec directly would also modify the vector held by the
  // calling R code.
  NumericVector out = clone(myVec);
  int n = out.size();
  // 0 = no fill in progress. Reset to 1 at every 1 and incremented for each
  // filled NA, so filling stops once the counter exceeds 3.
  int foundCount = 0;
  for(int i = 0; i < n; i++){
    if(out[i] == 1) foundCount = 1;
    if(ISNA(out[i])){
      // Logical && (not bitwise &) for the range check.
      if(foundCount >= 1 && foundCount <= 3){
        out[i] = 1;
        foundCount++;
      }
    }
  }
  return out;
}')
fixVec(vec)
# [1] NA NA 1 1 1 1 NA 1 1 1 1 NA NA NA
Benchmarks
library(microbenchmark)
# Benchmark of three solutions, including the Rcpp loop fixVec(). Each
# expression rebuilds the 140,000-element test vector first. Here ans.jaap and
# ans.nicola use offsets 0:3, so they also rewrite the position of each 1.
microbenchmark(
ans.jaap = {
vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
vec[rep(which(vec == 1), each = 4) + c(0:3)] <- 1
},
ans.nicola = {
vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
`[<-`(vec,c(outer(which(vec==1),0:3,"+")),1)
},
ans.symbolix = {
vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
vec <- fixVec(vec)
}
)
# Unit: microseconds
# expr min lq mean median uq max neval
# ans.jaap 2017.789 2264.318 2905.2437 2579.315 3588.4850 4667.249 100
# ans.nicola 1242.002 1626.704 3839.4768 2095.311 3066.4795 81299.962 100
# ans.symbolix 504.577 533.426 838.5661 718.275 966.9245 2354.373 100
vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4)
vec <- fixVec(vec)
vec2 <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4)
vec2[rep(which(vec2 == 1), each = 4) + c(0:3)] <- 1
identical(vec, vec2)
# [1] TRUE
The following code does what you asked for. It involves "shifting" the vector and then adding the shifted versions
# Treat NA as 0 so the vector and its shifted copies can be summed.
vec[is.na(vec)] <- 0
# Shift `x` right by `k` positions, padding with zeros and keeping its length.
# Unlike vec[1:(n - k)], this is also correct when length(x) <= k.
shift_right <- function(x, k) c(rep(0, k), x)[seq_along(x)]
vec <- vec + shift_right(vec, 1) + shift_right(vec, 2) + shift_right(vec, 3)
# A zero sum means there is no non-zero value here or in the three positions before.
vec[vec == 0] <- NA
# Overlapping windows can sum to more than 1, so collapse every hit back to 1.
vec[which(vec != 0)] <- 1
#  vec                 |   0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0
#  shift_right(vec, 1) | + 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0
#  shift_right(vec, 2) | + 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0
#  shift_right(vec, 3) | + 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0
#                      |-------------------------------------------
#                      |   0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0
A non-Vectorized solution, but nevertheless, another option using base R,
vec[unique(as.vector(t(sapply(which(vec == 1), function(i) seq(i+1, length.out = 3)))))] <- 1
vec
#[1] NA NA 1 1 1 1 NA 1 1 1 1 NA NA NA
vec1[unique(as.vector(t(sapply(which(vec1 == 1), function(i) seq(i+1, length.out = 3)))))] <- 1
vec1
#[1] NA NA 1 1 1 1 1 1 1 1 1 NA NA NA
How about this:
r <- which(vec == 1)
# Each 1 marks itself plus the next three positions. The window is capped at
# length(vec) so a 1 near the end cannot lengthen the vector, and an empty `r`
# yields an empty index instead of an invalid list subscript.
idx <- unlist(lapply(r, function(i) i:min(i + 3L, length(vec))))
vec[idx] <- 1
Examples:
vec <- c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA)
#[1] NA NA 1 1 1 1 NA 1 1 1 1 NA NA NA
vec <- c(NA, NA, 1, NA, 1, NA, NA, 1, NA, NA, NA, NA, NA, NA)
#[1] NA NA 1 1 1 1 1 1 1 1 1 NA NA NA
With sapply, any, and is.na:
# TRUE where vec[i] or any of the up-to-three preceding elements is not NA.
# seq_along() handles an empty vector and vapply() guarantees a logical result.
hit <- vapply(
  seq_along(vec),
  function(i) any(!is.na(vec[max(1, i - 3):i])),
  logical(1)
)
# Encode hits as 1 and everything else as NA.
output <- rep(NA_real_, length(vec))
output[hit] <- 1
Related
Df <- data.frame(prop1 = c(NA, NA, NA, "French", NA, NA,NA, "-29 to -20", NA, NA, NA, "Pop", NA, NA, NA, "French", "-29 to -20", "Pop"),
prop1_rank = c(NA, NA, NA, 0, NA, NA,NA, 11, NA, NA, NA, 1, NA, NA, NA, 40, 0, 2),
prop2 = c(NA, NA, NA, "Spanish", NA, NA,NA, "-19 to -10", NA, NA, NA, "Rock", NA, NA, NA, "Spanish", "-19 to -10", "Rock"),
prop2_rank = c(NA, NA, NA, 10, NA, NA,NA, 4, NA, NA, NA, 1, NA, NA, NA, 1, 0, 2),
initOSF1 = c(NA, NA, NA, NA, NA, "French", NA,NA,NA, "-29 to -20", NA, NA, NA, "Pop", NA, NA, NA, NA),
initOSF1_freq = c(NA, NA, NA, NA, NA, 66, NA,NA,NA, 0, NA, NA, NA, 14, NA, NA, NA, NA),
initOSF2 = c(NA, NA, NA, NA, NA, "Spanish", NA,NA,NA, "-19 to -10", NA, NA, NA, "Rock", NA, NA, NA, NA),
initOSF2_freq = c(NA, NA, NA, NA, NA, 0, NA,NA,NA, 6, NA, NA, NA, 14, NA, NA, NA, NA))
Df
I would like to organize this into
3 columns consisting: c("propositions", "ranks", "freqs"),
where,
The propositions column has the values: "French", "Spanish", "-29 to -20", "-19 to -10", "Pop", "Rock", with separate columns for the rank values (e.g., 0 for French, 10 for Spanish, etc.) and the frequency values (e.g., 66 for French, 0 for Spanish, etc.).
This is not an easy one. Probably a better solution exists:
library(tidyverse)
library(data.table)
setDT(Df) %>%
select(contains(c('prop', 'rank', 'freq'))) %>%
filter(!if_all(everything(), is.na)) %>%
melt(measure.vars = patterns(c('prop.$', 'rank$', 'freq'))) %>%
group_by(gr=cumsum(!is.na(value1)))%>%
summarise(across(-variable, ~if(length(.x)>1) na.omit(.x) else .x))
# A tibble: 12 x 4
gr value1 value2 value3
<int> <chr> <dbl> <dbl>
1 1 French 0 66
2 2 -29 to -20 11 0
3 3 Pop 1 14
4 4 French 40 NA
5 5 -29 to -20 0 NA
6 6 Pop 2 NA
7 7 Spanish 10 0
8 8 -19 to -10 4 6
9 9 Rock 1 14
10 10 Spanish 1 NA
11 11 -19 to -10 0 NA
12 12 Rock 2 NA
Here's some example code.
df <- structure(list(v1 = c(1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
1, 1, 1, 1, 0, 1, 0, 0, 1), v2 = c(1, 0, 1, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1), flag = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA)), class = "data.frame", row.names = c(NA, -22L))
I am interested in coding the variable "flag" such that when v1 = 0 and the next v2 = 0, both rows get a 'flag' in the flag column. If a flag has already been placed, it cannot be changed (i.e., row 5 would not be flagged alone, but was already flagged when looking at row 4)
Here is the desired dataframe.
df2 <- structure(list(v1 = c(1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
1, 1, 1, 1, 0, 1, 0, 0, 1), v2 = c(1, 0, 1, 1, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1), flag = structure(c(NA,
NA, NA, 1L, 1L, 1L, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 1L, 1L, NA), .Label = "flag", class = "factor")), class = "data.frame", row.names = c(NA,
-22L))
I have started with the code below, which matches the conditions I would like, but only changes the row matching the v1 condition, not both.
df2 <- df %>%
mutate( flag = case_when(v1 == 0 & lead(v2)== 0 ~ 'flag'))
This is a very simplified version of my true data. I know there are options other than case_when, but I would really like to use the case_when function for this (I would also be open to using ifelse).
library(tidyverse)
df %>%
# f marks the first row of each pair: v1 is 0 here and v2 is 0 on the next row.
mutate(f = v1 == 0 & lead(v2) == 0,
# Flag a row when it starts a pair (f) or the row before it does (lag(f));
# f = NULL then drops the helper column from the result.
flag = ifelse(f|lag(f), 'flag', NA), f = NULL)
v1 v2 flag
1 1 1 <NA>
2 1 0 <NA>
3 0 1 <NA>
4 0 1 flag
5 0 0 flag
6 0 1 flag
7 1 0 flag
8 1 1 <NA>
9 0 0 <NA>
10 1 1 <NA>
11 0 0 <NA>
12 1 1 <NA>
13 0 0 <NA>
14 1 1 <NA>
15 1 0 <NA>
16 1 0 <NA>
17 1 0 <NA>
18 0 1 <NA>
19 1 1 <NA>
20 0 1 flag
21 0 0 flag
22 1 1 <NA>
here example of my data
mydat=structure(list(ADR.N.14.0 = c(8140010250001, 8140010250002),
NOMYAR.N.16.6 = c(1, 1), KOFPOR1.N.16.6 = c(7, 10), POR1.C.254 = c("о",
"BB"), VOZPOR1.N.16.6 = c(80, 45), VYSPOR1.N.16.6 = c(24,
17), DEMPOR1.N.16.6 = c(36, 16), POLNOT1.N.16.6 = c(0.6,
0.9), ZAPZAH1.N.16.6 = c(210, 160), NOMYAR2.N.16.6 = c(1,
1), KOFSOCT2.N.16.6 = c(3, 0), POR2.C.254 = c("BB", "о"),
VOZPOR2.N.16.6 = c(70, 45), VYSPOR2.N.16.6 = c(22, 17), DEMPOR2.N.16.6 = c(26,
22), POLNOT2.N.16.6 = c(0, 0), ZAPZAH2.N.16.6 = c(0, 0)), class = "data.frame", row.names = c(NA,
-2L))
How can I, for each value of ADR,N,14,0, move the data from one set of variables under another?
To be more clear
here variables with prefix1
NOMYAR,N,16,6 KOFPOR**1**,N,16,6 POR**1**,C,254 VOZPOR**1**,N,16,6 VYSPOR**1**,N,16,6 DEMPOR**1**,N,16,6 POLNOT**1**,N,16,6 ZAPZAH**1**,N,16,6
and near rows with prefix2
NOMYAR**2**,N,16,6 KOFPOR**2**,N,16,6 POR**2**,C,254 VOZPOR**2**,N,16,6 VYSPOR**2**,N,16,6 DEMPOR**2**,N,16,6 POLNOT**2**,N,16,6 ZAPZAH**2**,N,16,6
So I need that, for ADR,N,14,0 = 8140010250001,
the content of the fields with the prefix 2 is placed under the content of the fields with the prefix 1,
like this
result=structure(list(ADR.N.14.0 = c(8140010250001, 8140010250001, 8140010250002,
8140010250002, NA, NA, NA, NA, NA, NA), NOMYAR.N.16.6 = c(1,
1, 1, 1, NA, NA, NA, NA, NA, NA), KOFPOR1.N.16.6 = c(7, 3, 10,
0, NA, NA, NA, NA, NA, NA), POR1.C.254 = c("о", "BB", "BB", "о",
"", "", "", "", "", ""), VOZPOR1.N.16.6 = c(80, 70, 45, 45, NA,
NA, NA, NA, NA, NA), VYSPOR1.N.16.6 = c(24, 22, 17, 17, NA, NA,
NA, NA, NA, NA), DEMPOR1.N.16.6 = c(36, 26, 16, 22, NA, NA, NA,
NA, NA, NA), POLNOT1.N.16.6 = c(0.6, 0, 0.9, 0, NA, NA, NA, NA,
NA, NA), ZAPZAH1.N.16.6 = c(210, 0, 160, 0, NA, NA, NA, NA, NA,
NA)), class = "data.frame", row.names = c(NA, -10L))
How can i do such transpose?
You can use pivot_longer and specify names_pattern to include pattern of names that you want together.
tidyr::pivot_longer(mydat, cols = -ADR.N.14.0,
names_to = c('.value'),
names_pattern = '(.*?)\\d?\\..*')
# ADR.N.14.0 NOMYAR KOFPOR POR VOZPOR VYSPOR DEMPOR POLNOT ZAPZAH KOFSOCT
# <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 8140010250001 1 7 о 80 24 36 0.6 210 3
#2 8140010250001 1 NA BB 70 22 26 0 0 NA
#3 8140010250002 1 10 BB 45 17 16 0.9 160 0
#4 8140010250002 1 NA о 45 17 22 0 0 NA
I have a dataset df like this, which is the data collected from individuals using a repeating instrument:
ID <- c('A1', 'A1', 'A1', 'A1', 'A2', 'A2', 'A2', 'A2', 'A2', 'A2', 'A3', 'A3', 'A3', 'A3', 'A4', 'A4', 'A4', 'A4', 'A4', 'A4', 'A4', 'A5', 'A5', 'A5', 'A5', 'A5', 'A5')
day_stat <- c(2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, NA, NA, NA, NA, NA, 1, 1, 1, NA, NA, 1, 2, 2, 2, 1, NA)
adm_dat <- c(NA, NA, NA, NA, NA, NA, NA, '2020-10-12', NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, '2020-10-18', NA, NA)
adm_ever <- c(NA, NA, NA, 1, NA, NA, NA, NA, NA, 2, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA)
df <- data.frame(ID, day_stat, adm_dat, adm_ever)
I am trying to filter the data like this:
df1 = df %>% filter(day_stat==1 | adm_dat!= NA | adm_ever==1)
Current result (not wanted):
Desired Output:
If one of these filter conditions is true for an ID, then keep all event data of that ID.
To check for NA values use is.na and to select entire group use group_by :
library(dplyr)
df %>%
# Evaluate the condition per ID so that one matching row keeps the whole group.
group_by(ID) %>%
# is.na() replaces the `adm_dat != NA` comparison, which always evaluates to NA.
filter(any(day_stat==1 | !is.na(adm_dat) | adm_ever==1))
# ID day_stat adm_dat adm_ever
# <chr> <dbl> <chr> <dbl>
# 1 A1 2 NA NA
# 2 A1 1 NA NA
# 3 A1 1 NA NA
# 4 A1 2 NA 1
# 5 A2 2 NA NA
# 6 A2 2 NA NA
# 7 A2 2 NA NA
# 8 A2 1 2020-10-12 NA
# 9 A2 1 NA NA
#10 A2 1 NA 2
# … with 13 more rows
We can use data.table
library(data.table)
setDT(df)[, .SD[any(day_stat==1 | !is.na(adm_dat) | adm_ever==1)], ID]
I want to select a number of variables based on their names in order to transform them. The variable names all start with inq and end with 7, 8, 10, or 13 to 15. This is not working for me... Apologies if this is obvious, but I cannot get it to work. Am I using the wrong functions, putting my functions and arguments together wrongly, or something else?
A reproducible example:
structure(list(inq1_1 = c(NA, 7, 5, 1, 1, 6, 5, 2, NA, NA), inq1_2 = c(NA,
7, 5, 1, 1, 6, 5, 5, NA, NA), inq1_3 = c(NA, 6, 4, 2, 1, 5, 2,
1, NA, NA), inq1_4 = c(NA, 6, 6, 1, 1, 6, 5, 1, NA, NA), inq1_5 = c(NA,
7, 3, 1, 1, 6, 2, 1, NA, NA), inq1_6 = c(NA, 7, 4, 4, 2, 7, 2,
4, NA, NA), inq1_7 = c(NA, 2, 4, 6, 7, 3, 1, 7, NA, NA), inq1_8 = c(NA,
1, NA, 2, 7, 2, 1, 4, NA, NA), inq1_9 = c(NA, 4, 6, 3, 1, 3,
7, 1, NA, NA), inq1_10 = c(NA, 3, 5, 7, 4, 4, 2, 7, NA, NA),
inq1_11 = c(NA, 5, 4, 7, 1, 6, 7, 6, NA, NA), inq1_12 = c(NA,
7, 5, 7, 4, 6, 7, 2, NA, NA), inq1_13 = c(NA, 3, 4, 6, 4,
3, 4, 4, NA, NA), inq1_14 = c(NA, 3, 2, 4, 4, 2, 1, 4, NA,
NA), inq1_15 = c(NA, 2, 2, 3, 5, 2, 4, 4, NA, NA), inqfinal_1 = c(5,
NA, 3, NA, NA, NA, NA, NA, NA, NA), inqfinal_2 = c(5, NA,
3, NA, NA, NA, NA, NA, NA, NA), inqfinal_3 = c(6, NA, 3,
NA, NA, NA, NA, NA, NA, NA), inqfinal_4 = c(5, NA, 3, NA,
NA, NA, NA, NA, NA, NA), inqfinal_5 = c(5, NA, 3, NA, NA,
NA, NA, NA, NA, NA), inqfinal_6 = c(6, NA, 3, NA, NA, NA,
NA, NA, NA, NA), inqfinal_7 = c(4, NA, 3, NA, NA, NA, NA,
NA, NA, NA), inqfinal_8 = c(2, NA, 3, NA, NA, NA, NA, NA,
NA, NA), inqfinal_9 = c(5, NA, 3, NA, NA, NA, NA, NA, NA,
NA), inqfinal_10 = c(4, NA, 3, NA, NA, NA, NA, NA, NA, NA
), inqfinal_11 = c(6, NA, 4, NA, NA, NA, NA, NA, NA, NA),
inqfinal_12 = c(6, NA, 4, NA, NA, NA, NA, NA, NA, NA), inqfinal_13 = c(4,
NA, 3, NA, NA, NA, NA, NA, NA, NA), inqfinal_14 = c(2, NA,
2, NA, NA, NA, NA, NA, NA, NA), inqfinal_15 = c(2, NA, 2,
NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
I am trying to become tidy and utilising dplyr as per the code below:
# select specific columns
sf_df %>% select(starts_with("inq"),
ends_with(7, 8, 10, 13:15)) %>% view(title = "test")
Alas, I get the following error:
Error in ends_with(7, 8, 10, 13:15) : unused argument (13:15)
14. .f(.x[[i]], ...)
13. map(.x[sel], .f, ...)
12. map_if(ind_list, is_helper, eval_tidy)
11. vars_select_eval(.vars, quos)
10. tidyselect::vars_select(names(.data), !!!quos(...))
9. select.data.frame(., starts_with("inq"), ends_with(7, 8, 10, 13:15))
8. select(., starts_with("inq"), ends_with(7, 8, 10, 13:15))
7. function_list[[i]](value)
6. freduce(value, `_function_list`)
5. `_fseq`(`_lhs`)
4. eval(quote(`_fseq`(`_lhs`)), env, env)
3. eval(quote(`_fseq`(`_lhs`)), env, env)
2. withVisible(eval(quote(`_fseq`(`_lhs`)), env, env))
1. sf_df %>% select(starts_with("inq"), ends_with(7, 8, 10, 13:15)) %>% view(title = "test")
Any help would be greatly appreciated! Thank you in advance.
Cheers,
Atanas.
A better option would be matches, which matches a regex pattern in the column name. Here, it matches the pattern 'inq' at the beginning (^) of the column name and one of the numbers at the end ($) of the column name.
sf_df %>%
select(matches('^inq.*(7|8|10|13|14|15)$'))
# A tibble: 10 x 12
# inq1_7 inq1_8 inq1_10 inq1_13 inq1_14 inq1_15 inqfinal_7 inqfinal_8 inqfinal_10 inqfinal_13 inqfinal_14 inqfinal_15
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 NA NA NA NA NA NA 4 2 4 4 2 2
# 2 2 1 3 3 3 2 NA NA NA NA NA NA
# 3 4 NA 5 4 2 2 3 3 3 3 2 2
# 4 6 2 7 6 4 3 NA NA NA NA NA NA
# 5 7 7 4 4 4 5 NA NA NA NA NA NA
# 6 3 2 4 3 2 2 NA NA NA NA NA NA
# 7 1 1 2 4 1 4 NA NA NA NA NA NA
# 8 7 4 7 4 4 4 NA NA NA NA NA NA
# 9 NA NA NA NA NA NA NA NA NA NA NA NA
#10 NA NA NA NA NA NA NA NA NA NA NA NA
Note that by using both starts_with and ends_with, the desired result may not be the expected one. The OP's dataset has 30 columns where all the column names start with 'inq'. So, with starts_with, it returns all columns, and adding ends_with, it is checking an OR match, e.g.
sf_df %>%
select(starts_with("inq"), ends_with("5")) %>%
ncol
#[1] 30 # returns 30 columns
It is not removing the columns whose names do not end with 5.
It is not a behavior of the order of arguments as
sf_df %>%
select(ends_with("5"), starts_with("inq")) %>%
ncol
#[1] 30
Now, if we use only ends_with
sf_df %>%
select(ends_with("5")) %>%
ncol
#[1] 4
Based on the example, all columns starts with 'inq', so, ends_with alone would be sufficient for a single string match as the documentation for ?ends_with specifies
match - A string.
and not multiple strings
where the Usage is
starts_with(match, ignore.case = TRUE, vars = peek_vars())