Capitalizing the first letter of characters in a column using substr function - r

I have this data frame
head(df)
## patnum hospstay lowph pltct race bwt gest inout twn lol magsulf
## 1 1 34 NA 100 white 1250 35 born at duke 0 NA NA
## 2 2 9 7.250000 244 white 1370 32 born at duke 0 NA NA
## 3 3 -2 7.059998 114 black 620 23 born at duke 0 NA NA
## 4 4 40 7.250000 182 black 1480 32 born at duke 0 NA NA
## 5 5 2 6.969997 54 black 925 28 born at duke 0 NA NA
## 6 6 62 7.189999 NA white 940 28 born at duke 0 NA NA
## meth toc delivery apg1 vent pneumo pda cld sex dead
## 1 0 0 abdominal 8 0 0 0 0 female 0
## 2 1 0 abdominal 7 0 0 0 0 female 0
## 3 0 1 vaginal 1 1 0 0 NA female 1
## 4 1 0 vaginal 8 0 0 0 0 male 0
## 5 0 0 abdominal 5 1 1 0 0 female 1
## 6 1 0 abdominal 8 1 0 0 0 female 0
The race variable has 4 entries, "white", "black", "native american", "oriental". I am wanting to replace this string with capitalized versions "White", "Black", "Native American", "Oriental". I would like to do this using the substr function. I'm not sure what code to use to accomplish this. I was provided an example below, where the
day_full = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
substr(day_full_1, nchar(day_full_1)-2, nchar(day_full_1)) = "DAY"
The result is: "SunDAY", "MonDAY", "TuesDAY", "WednesDAY", "ThursDAY", "FriDAY", "SaturDAY", "SunDAY"
This is similar to what I want to do, but I only want to have the first letter of each of the races to be capitalized. How would I translate this to make each first letter of the 4 races capital?
This is the solution I've tried now.
substr(SB_xlsx$race, 1, 1) <- toupper(substr(SB_xlsx$race, 1, 1))
substr(SB_xlsx$race, 1, 1)
## structure(list(patnum = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
## 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), hospstay = c(34,
## 9, -2, 40, 2, 62, 32, NA, NA, 28, 38, NA, 62, 69, 1, 93, 44,
## 50, 66, 65, 44, 70, 85, NA), lowph = c(NA, 7.25, 7.059998, 7.25,
## 6.969997, 7.189999, 7.32, NA, NA, 7.16, 7.039997, NA, 7.179996,
## 7.419998, 7.119999, 7.239998, 7.129997, 7.269997, 7.179996, 7.07,
## 7.289997, 7.129997, 7.189999, NA), pltct = c(100, 244, 114, 182,
## 54, NA, 282, NA, NA, 153, 229, NA, 182, 361, 378, 255, 186, NA,
## 260, 183, 134, 229, 68, NA), race = c("white", "white", "black",
## "black", "black", "white", "black", NA, NA, "black", "white",
## NA, "black", "white", "white", "black", "white", "black", "black",
## "white", "white", "black", "white", NA), bwt = c(1250, 1370,
## 620, 1480, 925, 940, 1255, 600, 700, 1350, 1310, 550, 1110, 1180,
## 970, 770, 1490, 1170, 1360, 1330, 1000, 1120, 740, NA), gest = c(35,
## 32, 23, 32, 28, 28, 29.5, 26, 24, 34, 32, 24, 28, 28, 28, 26,
## 33, 31, 31, 31, 28, 29, 26, NA), inout = c("born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", "born at duke", "born at duke", "born at duke",
## "born at duke", NA), twn = c(0, 0, 0, 0, 0, 0, 0, NA, NA, 0,
## 0, NA, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, NA), lol = c(NA, NA,
## NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA), magsulf = c(NA, NA, NA, NA, NA, NA,
## NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
## NA, NA), meth = c(0, 1, 0, 1, 0, 1, 1, NA, NA, 1, 0, NA, 0, 0,
## 1, 1, 1, 1, 1, 1, 0, 1, 0, NA), toc = c(0, 0, 1, 0, 0, 0, 0,
## NA, NA, 0, 0, NA, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, NA), delivery = c("abdominal",
## "abdominal", "vaginal", "vaginal", "abdominal", "abdominal",
## "vaginal", NA, NA, "abdominal", "vaginal", NA, "vaginal", "abdominal",
## "vaginal", "vaginal", "abdominal", "vaginal", "vaginal", "vaginal",
## "vaginal", "vaginal", "abdominal", NA), apg1 = c(8, 7, 1, 8,
## 5, 8, 9, NA, NA, 4, 6, NA, 6, 6, 2, 4, 8, 7, 1, 8, 5, 9, 9, NA
## ), vent = c(0, 0, 1, 0, 1, 1, 0, NA, NA, 0, 1, NA, 0, 0, 1, 1,
## 0, 0, 1, 1, 0, 1, 0, NA), pneumo = c(0, 0, 0, 0, 1, 0, 0, NA,
## NA, 0, 0, NA, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, NA), pda = c(0,
## 0, 0, 0, 0, 0, 0, NA, NA, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0,
## 0, 0, NA), cld = c(0, 0, NA, 0, 0, 0, 0, NA, NA, 0, 0, NA, 1,
## 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, NA), sex = c("female", "female",
## "female", "male", "female", "female", "female", NA, NA, "female",
## "male", NA, "male", "male", "female", "male", "male", "female",
## "male", "male", "female", "female", "female", NA), dead = c(0,
## 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
## 0, NA)), class = "data.frame", row.names = c(NA, -24L))

Two solutions:
df <- structure(list(patnum = 1:6, hospstay = c(34L, 9L, -2L, 40L, 2L, 62L), lowph = c(NA, 7.25, 7.059998, 7.25, 6.969997, 7.189999), pltct = c(100L, 244L, 114L, 182L, 54L, NA), race = c("white", "white", "black", "black", "black", "white"), bwt = c(1250L, 1370L, 620L, 1480L, 925L, 940L), gest = c(35L, 32L, 23L, 32L, 28L, 28L), inout = c("born_at_duke", "born_at_duke", "born_at_duke", "born_at_duke", "born_at_duke", "born_at_duke"), twn = c(0L, 0L, 0L, 0L, 0L, 0L), lol = c(NA, NA, NA, NA, NA, NA), magsulf = c(NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6"))
tools::toTitleCase(df$race)
# [1] "White" "White" "Black" "Black" "Black" "White"
But those are simpler with no spaces, let's create one for this exercise:
vec <- c("white", "black", "native american")
tools::toTitleCase(vec)
# [1] "White" "Black" "Native American"
We can also use gregexpr/regmatches to do it:
gre <- gregexpr("(?<=^| ).", vec, perl=TRUE)
regmatches(vec, gre)
# [[1]]
# [1] "w"
# [[2]]
# [1] "b"
# [[3]]
# [1] "n" "a"
regmatches(vec, gre) <- lapply(regmatches(vec, gre), toupper)
vec
# [1] "White" "Black" "Native American"
I'm sure there's a stringr-variant out there as well.
As for substr, it's feasible to use regex to find all (1) first-chars and (2) all chars that follow a space, then extract each one, then toupper-them, then put that back into place ... but at that point you're still using regex and effectively doing what toTitleCase is doing natively and what this gregexpr/regmatches code is doing a little more verbosely.
If all you wanted to do was replace the first character, though, and not care about letters after spaces, then
substr(vec, 1, 1) <- toupper(substr(vec, 1, 1))
vec
# [1] "White" "Black" "Native american"
though in this case, I think the lower-case "a" in "Native american" is wrong, so I don't think this is the best approach.
Scaling
Since you are concerned about scaling (I'm assuming you're venturing into 100K or more, since less than that is not going to be an issue with any method demonstrated), here's a comparison:
bench::mark(
toTitleCase = tools::toTitleCase(vec),
gregexpr = {
gre <- gregexpr("(?<=^| ).", vec, perl=TRUE)
regmatches(vec, gre) <- lapply(regmatches(vec, gre), toupper)
vec
}
)
# # A tibble: 2 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 toTitleCase 401us 474us 1735. 4.15KB 0 868 0 500ms <chr [3]> <Rprofmem [9 x 3]> <bench_~ <tibble~
# 2 gregexpr 111us 205us 5240. 24.28KB 2.26 2315 1 442ms <chr [3]> <Rprofmem [6 x 3]> <bench_~ <tibble~
Granted, vec size 3 is pretty small, let's scale that up a bit.
vec30000 <- rep(vec, 10000) # 30000 length
bench::mark(
toTitleCase = tools::toTitleCase(vec30000),
gregexpr = {
gre <- gregexpr("(?<=^| ).", vec30000, perl=TRUE)
regmatches(vec30000, gre) <- lapply(regmatches(vec30000, gre), toupper)
vec30000
}
)
# Warning: Some expressions had a GC in every iteration; so filtering is disabled.
# # A tibble: 2 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 toTitleCase 6.01s 6.01s 0.166 36MB 0.832 1 5 6.01s <chr [30,000]> <Rprofmem [~ <bench_t~ <tibble~
# 2 gregexpr 773.13ms 773.13ms 1.29 241MB 2.59 1 2 773.13ms <chr [30,000]> <Rprofmem [~ <bench_t~ <tibble~
Looking at the `itr/sec` column showing iterations per second, it appears that even at scale, the gregexpr method works better. (If you look at the source code for toTitleCase, you'll see why: it's consider a lot more than just space-delimited words, it's also consider linking words, exception-words, etc.)

Another way is to use perl substitution:
gsub('\\b(\\w)', '\\U\\1', vec, perl = TRUE)
[1] "White" "Black" "Native American"
This method is way faster (ie 35+ times Faster) than the gregexpr method mentioned before:
microbenchmark::microbenchmark(
gsub = gsub('\\b(\\w)', '\\U\\1', vec30000, perl = TRUE),
gregexpr = {
gre <- gregexpr("(?<=^| ).", vec30000, perl=TRUE)
regmatches(vec30000, gre) <- lapply(regmatches(vec30000, gre), toupper)
vec30000 },
unit = 'relative', check = 'equal')
Unit: relative
expr min lq mean median uq max neval
gsub 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 5
gregexpr 37.37549 41.10014 29.00345 24.49221 25.39978 25.54325 5

Related

Using xpath in R to scrape data from website with multiple similar paths

I'm trying to scrape in R a list of apartments for sale and the basic info (address, m2, price, rooms, etc.) of this website: https://www.boligsiden.dk/tilsalg/ejerlejlighed?sortAscending=true&priceMin=3000000&priceMax=7000000 (see also below a screenshot of the page + inspect)
Using SelectorGadget i haven't been able to create a path that unique extracts the square meters of all 50 apartments on page 1, and another path that unique extracts the numbers of rooms, etc.
I did manage to find a path that unique extracts the addresses (see in code block below). But this is in a separate block/class from the rest of the text.
Here is my current code:
library(rvest)
library(dplyr)
link = "https://www.boligsiden.dk/tilsalg/ejerlejlighed?sortAscending=true&priceMin=3000000&priceMax=7000000&page=1"
page = read_html(link)
address = page %>% html_nodes("div.mr-2") %>% html_text()
price = #MISSING - CAN'T FIGURE OUT
sqm = #MISSING - CAN'T FIGURE OUT
rooms = #MISSING - CAN'T FIGURE OUT
forsale = data.frame(address, price, sqm, rooms, stringsAsFactors = FALSE)
Any ideas on how to approach it?
I tried using xpath as well to extract the sqm, but only managed to get one specific text field extracted, not all 50 on the page.
Alternative approaches are welcome too. Thanks in advance!
Using their API (found in the network section), you can call on it and retrieve in the information as such:
library(tidyverse)
library(httr2)
"https://api.prod.bs-aws-stage.com/search/cases?addressTypes=condo&priceMax=7000000&priceMin=3000000&per_page=100&page=1&sortAscending=true&sortBy=timeOnMarket" %>%
request() %>%
req_perform() %>%
resp_body_json(simplifyVector = TRUE) %>%
pluck("cases") %>%
unnest(address, names_sep = "_") %>%
mutate(
address = str_c(address_roadName, address_houseNumber, address_zipCode, sep = " "),
.before = 1
) %>%
select(address,
price = priceCash,
sqm = housingArea,
rooms = numberOfRooms)
# A tibble: 100 × 4
address price sqm rooms
<chr> <int> <int> <int>
1 Holsteinsgade 66 2100 3135000 56 2
2 Tuborgvej 60 2900 4875000 114 4
3 Poppellunden 8 4000 3350000 92 3
4 Hyldegårds Tværvej 5 2920 6498000 115 3
5 Grollowstræde 3 3000 3495000 92 3
6 Rasmus Rasks Vej 8 2500 3995000 80 3
7 Ryesgade 7 8000 4598000 110 4
8 Carl Th. Zahles Gade 8 2300 5795000 113 3
9 Strandlodsvej 23E 2300 5495000 101 3
10 Nordre Fasanvej 162 2000 4695000 90 4
# … with 90 more rows
# ℹ Use `print(n = ...)` to see more rows
Which variables are available for extraction:
"https://api.prod.bs-aws-stage.com/search/cases?addressTypes=condo&priceMax=7000000&priceMin=3000000&per_page=100&page=1&sortAscending=true&sortBy=timeOnMarket" %>%
request() %>%
req_perform() %>%
resp_body_json(simplifyVector = TRUE) %>%
pluck("cases") %>%
glimpse
Rows: 100
Columns: 37
$ `_links` <df[,1]> <data.frame[30 x 1]>
$ address <df[,28]> <data.frame[30 x 28]>
$ addressType <chr> "condo", "condo", "condo", "condo", "condo", "condo", "c…
$ caseID <chr> "89194273-5948-4734-8085-fec9d42ac3c2", "ff6a9ff5-eacf-…
$ caseUrl <chr> "https://www.lokalbolig.dk/?sag=26-X0001820", "https://www.…
$ coordinates <df[,3]> <data.frame[30 x 3]>
$ daysOnMarket <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ defaultImage <df[,1]> <data.frame[30 x 1]>
$ descriptionBody <chr> "Lys stuelejlighed med to terrasser i HørsholmNær centrum o…
$ descriptionTitle <chr> "Lys stuelejlighed med to terrasser i Hørsholm", "Fantas…
$ distinction <chr> "real_estate", "real_estate", "real_estate", "real_estate",…
$ energyLabel <chr> "c", "c", "d", "c", "d", "c", "c", "c", "c", "c", "c", "…
$ highlighted <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
$ housingArea <int> 98, 82, 64, 91, 81, 97, 78, 113, 81, 91, 133, 69, 80, 64, 1…
$ images <list> [<data.frame[5 x 1]>], [<data.frame[3 x 1]>], [<data.frame[…
$ monthlyExpense <int> 4183, 3888, 2798, 3205, 3557, 3405, 3233, 2688, 3921, 3907,…
$ nextOpenHouse <df[,4]> <data.frame[30 x 4]>
$ numberOfFloors <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1,…
$ numberOfRooms <int> 3, 3, 2, 3, 3, 4, 3, 4, 3, 4, 3, 3, 3, 2, 4, 2, 3, 4, 2, 4,…
$ pageViews <int> 126, 341, 191, 160, 358, 356, 242, 516, 133, 180, 134, 106…
$ perAreaPrice <int> 40765, 54817, 62422, 71374, 43148, 58711, 60897, 41150, 480…
$ priceCash <int> 3995000, 4495000, 3995000, 6495000, 3495000, 5695000, 47…
$ priceChangePercentage <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ providerCaseID <chr> "26-X000182018025lok", "114-2102", "43000000643cam", "13433…
$ realEstate <df[,3]> <data.frame[30 x 3]>
$ realtor <df[,21]> <data.frame[30 x 21]>
$ slug <chr> "oerbaekgaards-alle-901-0-tv-2970-hoersholm-02239600_901_st…
$ status <chr> "open", "open", "open", "open", "open", "open", "open", "op…
$ timeOnMarket <df[,2]> <data.frame[30 x 2]>
$ totalClickCount <int> 103, 274, 109, 121, 227, 273, 205, 415, 82, 128, 122, 92, 1…
$ totalFavourites <int> 1, 3, 0, 0, 4, 1, 1, 3, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 2,…
$ utilitiesConnectionFee <df[,1]> <data.frame[30 x 1]>
$ yearBuilt <int> 2002, 1886, 1907, 2008, 1932, 1914, 1900, 1926, 1934, 1932,…
$ basementArea <int> NA, NA, NA, NA, NA, NA, NA, NA, 88, NA, NA, NA, NA, NA, …
$ lotArea <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 5327, NA, N…
$ weightedArea <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ secondaryAddressType <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
How you can save the data into your environment
df <- "https://api.prod.bs-aws-stage.com/search/cases?addressTypes=condo&priceMax=7000000&priceMin=3000000&per_page=100&page=1&sortAscending=true&sortBy=timeOnMarket" %>%
request() %>%
req_perform() %>%
resp_body_json(simplifyVector = TRUE) %>%
pluck("cases")
Selectors are kind of convoluted and fragile, but for now it seems to work:
library(rvest)
library(dplyr)
library(purrr)
library(stringr)
url <- "https://www.boligsiden.dk/tilsalg/ejerlejlighed?sortAscending=true&priceMin=3000000&priceMax=7000000"
html <- read_html(url)
html |> html_elements("div.shadow.overflow-hidden.mx-4") |>
map_dfr(\(x)
list(
"address" = html_element(x ,"div.mr-2") |> html_text2() |> str_squish(),
"price" = html_element(x ,"span.text-lg.pr-2") |> html_text(),
"sqm" = html_element(x ,"div.hidden.grid-cols-5.grid-rows-2 > div:nth-child(1) .text-sm" ) |> html_text(),
"rooms" = html_element(x ,"div.hidden.grid-cols-5.grid-rows-2 > div:nth-child(4) .text-sm" ) |> html_text()
)
)
#> # A tibble: 50 × 4
#> address price sqm rooms
#> <chr> <chr> <chr> <chr>
#> 1 Poppellunden 8, 4. tv. Himmelev, 4000 Roskilde 3.350.000 kr. 92 m² 3 Vær.
#> 2 Tuborgvej 60, 2. th. 2900 Hellerup 4.875.000 kr. 114 m² 4 Vær.
#> 3 Hyldegårds Tværvej 5, st. tv. 2920 Charlottenlund 6.498.000 kr. 115 m² 3 Vær.
#> 4 Grollowstræde 3 3000 Helsingør 3.495.000 kr. 92 m² 3 Vær.
#> 5 Ryesgade 7, 2. tv. 8000 Aarhus C 4.598.000 kr. 110 m² 4 Vær.
#> 6 Carl Th. Zahles Gade 8, 2. tv. 2300 København S 5.795.000 kr. 113 m² 3 Vær.
#> 7 Rasmus Rasks Vej 8, 2. tv. 2500 Valby 3.995.000 kr. 80 m² 3 Vær.
#> 8 Strandlodsvej 23E, 1. mf. 2300 København S 5.495.000 kr. 101 m² 3 Vær.
#> 9 Nordre Fasanvej 162, 3. th. 2000 Frederiksberg 4.695.000 kr. 90 m² 4 Vær.
#> 10 Ringstedgade 17B, 1. th. 4000 Roskilde 5.395.000 kr. 137 m² 5 Vær.
#> # … with 40 more rows
Created on 2023-02-01 with reprex v2.0.2

Applying the same operation with multiple columns of similar names in R

I'm wondering if there is a way to simplify this code to avoid repetition givent that the column names are similar excepting one character that increases for each operation.
out <- df %>%
mutate (ATN1.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch1/RefCh1)),
ATN2.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch2/RefCh2)),
ATN3.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch3/RefCh3)),
ATN4.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch4/RefCh4)),
ATN5.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch5/RefCh5)),
ATN6.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch6/RefCh6)),
ATN7.1 = ifelse(Status == 1, NA_integer_, -100 * log(Sen1Ch7/RefCh7)))
This is a small subset of my data if you wanna play with it
df = structure(list(Status = c(1, 17, 1, 1, 1, 1, 2, 0, 0, 0), ATN1.1 = c(NA,
NA, NA, NA, NA, NA, 0, 0.187761662304176, 0.373310604025045,
0.570139498143909), ATN2.1 = c(NA, NA, NA, NA, NA, NA, 0, 0.136443172947395,
0.269071359915515, 0.407552762179439), ATN3.1 = c(NA, NA, NA,
NA, NA, NA, 0, 0.113733164068766, 0.224219770615697, 0.336923929839777
), ATN4.1 = c(NA, NA, NA, NA, NA, NA, 0, 0.0942969310983806,
0.186894753425896, 0.279629737677226), ATN5.1 = c(NA, NA, NA,
NA, NA, NA, 0, 0.0753327883349684, 0.149617411430523, 0.22690457078205
), ATN6.1 = c(NA, NA, NA, NA, NA, NA, 0, 0.0493106158715682,
0.100348708536177, 0.155828822066352), ATN7.1 = c(NA, NA, NA,
NA, NA, NA, 0, 0.0526398637123631, 0.103191368342497, 0.154644102801848
), ATN0.1.1 = c(NA, NA, NA, NA, NA, NA, 15.054824247419, 15.054824247419,
15.054824247419, 15.054824247419), ATN0.2.1 = c(NA, NA, NA, NA,
NA, NA, 24.1338734012274, 24.1338734012274, 24.1338734012274,
24.1338734012274), ATN0.3.1 = c(NA, NA, NA, NA, NA, NA, 27.4233147524393,
27.4233147524393, 27.4233147524393, 27.4233147524393), ATN0.4.1 = c(NA,
NA, NA, NA, NA, NA, 20.8560560826831, 20.8560560826831, 20.8560560826831,
20.8560560826831), ATN0.5.1 = c(NA, NA, NA, NA, NA, NA, 17.1645092239121,
17.1645092239121, 17.1645092239121, 17.1645092239121), ATN0.6.1 = c(NA,
NA, NA, NA, NA, NA, 4.4180613710882, 4.4180613710882, 4.4180613710882,
4.4180613710882), ATN0.7.1 = c(NA, NA, NA, NA, NA, NA, 10.8192165605015,
10.8192165605015, 10.8192165605015, 10.8192165605015), Sen1Ch1 = c(0,
99, 0, 783198, 785643, 787093, 786717, 785935, 784922, 783784
), Sen2Ch1 = c(0, 324, 0, 793643, 796398, 798041, 798658, 798957,
799003, 798951), Sen1Ch2 = c(0, 53, 0, 739627, 741339, 742308,
741804, 741195, 740403, 739520), Sen2Ch2 = c(0, 416, 0, 743716,
745420, 746399, 746532, 746599, 746467, 746279), Sen1Ch3 = c(0,
49, 0, 720709, 722113, 722900, 722515, 722002, 721364, 720681
), Sen2Ch3 = c(0, 294, 0, 734485, 735877, 736650, 736749, 736783,
736664, 736513), Sen1Ch4 = c(0, 61, 0, 732332, 732529, 732487,
731524, 730678, 729723, 728756), Sen2Ch4 = c(0, 222, 0, 737261,
737172, 736976, 736329, 735869, 735302, 734762), Sen1Ch5 = c(0,
59, 0, 765776, 767327, 768116, 767883, 767617, 767121, 766567
), Sen2Ch5 = c(0, 248, 0, 775632, 777074, 777800, 777883, 777970,
777832, 777655), Sen1Ch6 = c(0, 57, 0, 899145, 901398, 902644,
902723, 902737, 902436, 902095), Sen2Ch6 = c(0, 352, 0, 926157,
928263, 929423, 929746, 930043, 930042, 930025), Sen1Ch7 = c(0,
45, 0, 845802, 848332, 849736, 849960, 850137, 849979, 849764
), Sen2Ch7 = c(0, 360, 0, 867160, 869852, 871321, 871830, 872308,
872428, 872500), RefCh1 = c(0, 10100, 0, 908802, 911770, 913546,
914536, 915344, 915862, 916336), RefCh2 = c(0, 6200, 0, 940232,
942473, 943743, 944281, 944794, 945037, 945218), RefCh3 = c(0,
6200, 0, 947069, 948944, 950017, 950484, 950890, 951100, 951271
), RefCh4 = c(0, 14700, 0, 900977, 901433, 901543, 901167, 900974,
900630, 900271), RefCh5 = c(0, 8250, 0, 908355, 910304, 911295,
911674, 912045, 912133, 912179), RefCh6 = c(0, 6200, 0, 939365,
941703, 942978, 943500, 943980, 944147, 944314), RefCh7 = c(0,
6200, 0, 941728, 944713, 946375, 947078, 947774, 948077, 948325
)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"
))
You can feed dynamic variable names to mutate with !!sym for example:
for(i in 1:7){
out <- df %>%
mutate(!!sym(sprintf("ATN%s.1",i)) := ifelse(Status == 1, NA_integer_, -100 * log(!!sym(paste0("Sen1Ch",i))/!!sym(paste0("RefCh",i)))))
}
Note you need := inside the mutate.
Here is a base r solution with mapply. First define an auxiliary function f to make the code more readable, then get the column names to be changed and that take part in the formula with regular expressions, finally, csall the function f in a mapply loop.
f <- function(x, y, Status) {
ifelse(Status == 1, NA_integer_, -100 * log(x/y))
}
atn <- grep("^ATN\\d\\.1$", names(df), value = TRUE)
sen1ch <- grep("^Sen1Ch", names(df), value = TRUE)
refch <- grep("^RefCh", names(df), value = TRUE)
df[atn] <- mapply(f, df[sen1ch], df[refch], MoreArgs = list(Status = df$Status))
df
#> # A tibble: 10 x 36
#> Status ATN1.1 ATN2.1 ATN3.1 ATN4.1 ATN5.1 ATN6.1 ATN7.1 ATN0.1.1 ATN0.2.1
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 NA NA NA NA NA NA NA NA NA
#> 2 17 463. 476. 484. 548. 494. 469. 493. NA NA
#> 3 1 NA NA NA NA NA NA NA NA NA
#> 4 1 NA NA NA NA NA NA NA NA NA
#> 5 1 NA NA NA NA NA NA NA NA NA
#> 6 1 NA NA NA NA NA NA NA NA NA
#> 7 2 15.1 24.1 27.4 20.9 17.2 4.42 10.8 15.1 24.1
#> 8 0 15.2 24.3 27.5 21.0 17.2 4.47 10.9 15.1 24.1
#> 9 0 15.4 24.4 27.6 21.0 17.3 4.52 10.9 15.1 24.1
#> 10 0 15.6 24.5 27.8 21.1 17.4 4.57 11.0 15.1 24.1
#> # ... with 26 more variables: ATN0.3.1 <dbl>, ATN0.4.1 <dbl>, ATN0.5.1 <dbl>,
#> # ATN0.6.1 <dbl>, ATN0.7.1 <dbl>, Sen1Ch1 <dbl>, Sen2Ch1 <dbl>,
#> # Sen1Ch2 <dbl>, Sen2Ch2 <dbl>, Sen1Ch3 <dbl>, Sen2Ch3 <dbl>, Sen1Ch4 <dbl>,
#> # Sen2Ch4 <dbl>, Sen1Ch5 <dbl>, Sen2Ch5 <dbl>, Sen1Ch6 <dbl>, Sen2Ch6 <dbl>,
#> # Sen1Ch7 <dbl>, Sen2Ch7 <dbl>, RefCh1 <dbl>, RefCh2 <dbl>, RefCh3 <dbl>,
#> # RefCh4 <dbl>, RefCh5 <dbl>, RefCh6 <dbl>, RefCh7 <dbl>
Created on 2022-04-14 by the reprex package (v2.0.1)

transpose from one variable under another in R

here example of my data
mydat=structure(list(ADR.N.14.0 = c(8140010250001, 8140010250002),
NOMYAR.N.16.6 = c(1, 1), KOFPOR1.N.16.6 = c(7, 10), POR1.C.254 = c("о",
"BB"), VOZPOR1.N.16.6 = c(80, 45), VYSPOR1.N.16.6 = c(24,
17), DEMPOR1.N.16.6 = c(36, 16), POLNOT1.N.16.6 = c(0.6,
0.9), ZAPZAH1.N.16.6 = c(210, 160), NOMYAR2.N.16.6 = c(1,
1), KOFSOCT2.N.16.6 = c(3, 0), POR2.C.254 = c("BB", "о"),
VOZPOR2.N.16.6 = c(70, 45), VYSPOR2.N.16.6 = c(22, 17), DEMPOR2.N.16.6 = c(26,
22), POLNOT2.N.16.6 = c(0, 0), ZAPZAH2.N.16.6 = c(0, 0)), class = "data.frame", row.names = c(NA,
-2L))
how for each value of ADR,N,14,0move data from one variable under another.
To be more clear
here variables with prefix1
NOMYAR,N,16,6 KOFPOR**1**,N,16,6 POR**1**,C,254 VOZPOR**1**,N,16,6 VYSPOR**1**,N,16,6 DEMPOR**1**,N,16,6 POLNOT**1**,N,16,6 ZAPZAH**1**,N,16,6
and near rows with prefix2
NOMYAR**2**,N,16,6 KOFPOR**2**,N,16,6 POR**2**,C,254 VOZPOR**2**,N,16,6 VYSPOR**1**,N,16,6 DEMPOR**2**,N,16,6 POLNOT**2**,N,16,6 ZAPZAH**2**,N,16,6
so i need that for for ADR,N,14,0 =8140010250001
the content of the fields with the prefix 2 was under the content of the fields with the prefix 1
like this
result=structure(list(ADR.N.14.0 = c(8140010250001, 8140010250001, 8140010250002,
8140010250002, NA, NA, NA, NA, NA, NA), NOMYAR.N.16.6 = c(1,
1, 1, 1, NA, NA, NA, NA, NA, NA), KOFPOR1.N.16.6 = c(7, 3, 10,
0, NA, NA, NA, NA, NA, NA), POR1.C.254 = c("о", "BB", "BB", "о",
"", "", "", "", "", ""), VOZPOR1.N.16.6 = c(80, 70, 45, 45, NA,
NA, NA, NA, NA, NA), VYSPOR1.N.16.6 = c(24, 22, 17, 17, NA, NA,
NA, NA, NA, NA), DEMPOR1.N.16.6 = c(36, 26, 16, 22, NA, NA, NA,
NA, NA, NA), POLNOT1.N.16.6 = c(0.6, 0, 0.9, 0, NA, NA, NA, NA,
NA, NA), ZAPZAH1.N.16.6 = c(210, 0, 160, 0, NA, NA, NA, NA, NA,
NA)), class = "data.frame", row.names = c(NA, -10L))
How can i do such transpose?
You can use pivot_longer and specify names_pattern to include pattern of names that you want together.
tidyr::pivot_longer(mydat, cols = -ADR.N.14.0,
names_to = c('.value'),
names_pattern = '(.*?)\\d?\\..*')
# ADR.N.14.0 NOMYAR KOFPOR POR VOZPOR VYSPOR DEMPOR POLNOT ZAPZAH KOFSOCT
# <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 8140010250001 1 7 о 80 24 36 0.6 210 3
#2 8140010250001 1 NA BB 70 22 26 0 0 NA
#3 8140010250002 1 10 BB 45 17 16 0.9 160 0
#4 8140010250002 1 NA о 45 17 22 0 0 NA

How can I locate the first instance of NA in a row within a data frame?

How can I see where (in which column) the first NA appears in a row within a data frame?
I am looking at points where participants drop off in a process which is 10 steps long.
Each step is identified via a corresponding column, meaning 10 columns in total.
The way that I can tell if somebody has completed any one step is if I see a date time value in the column which indicates the moment they completed that step.
If they haven't completed the step, it will display NA and so will the following columns.
For example, if I see NA in column 5 for a particular row, then I know that that particular user did not proceed past step 4 as the remaining columns will also show NA.
The idea is that participants complete all 10 steps, meaning that they have completed the process in full.
I want to be able to identify the most common drop-off point.
My dataset is 2,000 rows deep - how can I check and/or identify this quickly?
Sample data:
structure(list(associate = c("tXQCMHwGFy", "JzObuwUnkJ", "2fM04XFVja",
"uFsZTj2i2M", "ZsI0u5ka2j", "9r98DMXxFE", "NtmXw4qnIa", "oGB0Ugi93h",
"G0r2yOxM7s", "MIpQqbBagS", "HCGJ5kSOlk", "3ljP9FuGcA", "5k7OvbBZUH",
"6DDEbTWhBD", "xuU5Ewninw", "5UGABh3kcg", "G5etNVDoEH", "ejlCBv3dp2",
"2DUWxEFt6o", "sCJeaxCSk5", "sb9QKBDSHl", "E8n3XZSS1x", "Ld7rFWFKag",
"ykziBo9kOx", "Z9mOsGpDNE"), accountCreation = structure(c(1524606379.904,
1528147858.812, 1521994536.637, 1522097826.043, 1528150007.134,
1526575446.645, 1523493362.438, 1528123246.558, 1528135004.808,
1527791947.924, 1526755863.609, 1525455650.394, 1523409400.766,
1524347073.427, 1526134766.407, 1523638698.97, 1527878066.61,
1524855389.236, 1526309009.378, 1520972884.396, 1527180696.03,
1527268883.689, 1521646455.016, 1526837992.595, 1521040859.622
), class = c("POSIXct", "POSIXt")), profileSetup = structure(c(1524606693.345,
1528148032.015, 1521994616.897, 1522097826.043, 1528186485.637,
1526575497.987, 1523493556.798, 1528123314.197, 1528135180.95,
1527792152.877, 1526756131.911, 1525455787.847, 1523409400.766,
1524347073.427, 1526134850.566, 1523638905.289, 1527878482.462,
1524855535.686, 1526309106.294, 1522186725.043, 1527180799.909,
1527269009.143, 1521646455.016, 1526838102.323, 1521040859.622
), class = c("POSIXct", "POSIXt")), profilesetupDuration = c(314,
174, 80, 0, 36478, 51, 194, 68, 176, 205, 268, 137, 0, 0, 84,
207, 416, 146, 97, 1213841, 103, 126, 0, 110, 0), introductionSplash = structure(c(1524872052.263,
1528148043.062, 1521995730.924, 1522097826.043, 1528186496.499,
1526575506.96, 1523493567.959, 1528123329.044, 1528135237.755,
1527792185.349, NA, 1525455815.855, 1523409400.766, 1524347073.427,
1526134861.747, 1523638967.684, 1527878727.235, 1524855546.038,
1526309117.104, 1522186739.397, NA, 1527269018.641, 1521646455.016,
1526838112.374, 1521040859.622), class = c("POSIXct", "POSIXt"
)), introductionSplashDuration = c(265673, 185, 1194, 0, 36489,
60, 205, 83, 233, 238, NA, 165, 0, 0, 95, 269, 661, 157, 108,
1213855, NA, 135, 0, 120, 0), introduction = structure(c(1525124180.491,
1528148744.594, 1521996568.337, 1522097826.043, NA, 1526576050.815,
1523495507, 1528126805.572, NA, 1527792470.951, NA, 1525456759.777,
1523409400.766, 1524347073.427, 1526135265.531, 1523639316.761,
1527878956.368, 1524861227.537, 1526310376.89, 1522187755.31,
NA, 1527269672.153, 1521646455.016, 1526838283.459, 1521040859.622
), class = c("POSIXct", "POSIXt")), introductionDuration = c(517801,
886, 2032, 0, NA, 604, 2145, 3559, NA, 523, NA, 1109, 0, 0, 499,
618, 890, 5838, 1367, 1214871, NA, 789, 0, 291, 0), demoChatSkipped = structure(c(NA,
1528148761.447, NA, 1522097826.043, NA, 1526576060.249, NA, NA,
NA, 1527792487.742, NA, 1525456803.893, 1523409400.766, 1524347073.427,
1526147587.803, NA, NA, NA, NA, NA, NA, 1527269694.132, 1521646455.016,
1526838287.934, 1521040859.622), class = c("POSIXct", "POSIXt"
)), demoChatSkippedDuration = c(NA, 903, NA, 0, NA, 614, NA,
NA, NA, 540, NA, 1153, 0, 0, 12821, NA, NA, NA, NA, NA, NA, 811,
0, 295, 0), approval = structure(c(1525124264.718, 1528148756.313,
1522018833.517, 1522097826.043, NA, 1526576055.489, 1523538955.529,
1528136805.681, NA, 1527792479.256, NA, 1525456805.673, 1523409400.766,
1524347073.427, 1526147585.05, 1523639448.648, 1527879134.158,
1524861732.505, 1526315087.819, 1522188033.261, 1527180827.746,
1527269692.115, 1521646455.016, 1526838288.734, 1521040859.622
), class = c("POSIXct", "POSIXt")), approvalDuration = c(517885,
898, 24297, 0, NA, 609, 45593, 13559, NA, 532, NA, 1155, 0, 0,
12819, 750, 1068, 6343, 6078, 1215149, 131, 809, 0, 296, 0),
tutorial = structure(c(NA, NA, NA, 1522097826.043, NA, NA,
NA, NA, NA, NA, NA, NA, 1523409400.766, 1524347073.427, NA,
NA, NA, NA, NA, NA, NA, NA, 1521646455.016, NA, 1521040859.622
), class = c("POSIXct", "POSIXt")), tutorialDuration = c(NA,
NA, NA, 0, NA, NA, NA, NA, NA, NA, NA, NA, 0, 0, NA, NA,
NA, NA, NA, NA, NA, NA, 0, NA, 0), letsbegin = structure(c(1525124456.616,
1528148773.37, 1522031049.317, 1522097826.043, NA, 1526576071.6,
1523538956.159, 1528136822.297, NA, 1527794019.564, NA, 1525456849.582,
1523409400.766, 1524347073.427, 1526312517.824, 1523639449.148,
1527879134.675, 1524861750.153, 1526317200.235, 1522188066.352,
1527180828.158, NA, 1521646455.016, 1527015876.057, 1521040859.622
), class = c("POSIXct", "POSIXt")), letsbeginDuration = c(518077,
915, 36513, 0, NA, 625, 45594, 13576, NA, 2072, NA, 1199,
0, 0, 177751, 751, 1068, 6361, 8191, 1215182, 132, NA, 0,
177884, 0), demoChatDuration = c(517884, NA, 24297, NA, NA,
NA, 2499, 13559, NA, NA, NA, NA, NA, NA, 13201, 729, 1029,
6342, 6078, 1215148, NA, 967, NA, NA, NA)), row.names = c(937L,
1941L, 396L, 30L, 1950L, 1337L, 602L, 1812L, 1872L, 1719L, 1423L,
1077L, 173L, 234L, 1204L, 680L, 1748L, 989L, 1243L, 251L, 1568L,
1615L, 196L, 1451L, 154L), class = "data.frame")
If you want the name of the column added to your data.frame, something like this will do:
create some data, first 10 records of mtcars. replicate your situation with NA's to the end.
df <- mtcars[1:10, ]
df[3, 3:11] <- NA
df[6, 5:11] <- NA
df$dropofpoint <- apply(df, 1, function(x) names(which(is.na(x)))[1])
head(df)
mpg cyl disp hp drat wt qsec vs am gear carb dropofpoint
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 <NA>
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 <NA>
Datsun 710 22.8 4 NA NA NA NA NA NA NA NA NA disp
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 <NA>
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 <NA>
Valiant 18.1 6 225 105 NA NA NA NA NA NA NA drat
If you want a quick solution, I would vectorize this using max.col
res <- max.col(is.na(df), ties = "first")
Though max.col will return 1 even if there were no NAs at all in a specific row. Hence, you can add the following line to handle those specific cases
if(any(res == 1)) is.na(res) <- (res == 1) & !is.na(df[[1]])
This will convert those cases to NA- meaning that a column index for that row wasn't found
In the following code I assume that you want to return a special value in case there are no NA in a row.
set.seed(5239) # Make the results reproducible
dat <- matrix(1:40, 4)
dat[sample(40, 5)] <- NA
dat <- as.data.frame(dat) # Not strictly needed
apply(dat, 1, function(x) {
w <- which(is.na(x))
if(length(w) > 0) min(w) else Inf
})
#[1] 4 6 Inf 2
The special value here is Inf. You can change this to, for instance, length(x) or another value of your choice.
Here's an example:
## some fake data
Data <- matrix(c(0,0,0,NA,0,NA,NA,NA,0,0,NA,NA), nrow = 3, byrow = TRUE)
## which ones are the first NA's per row
## Edited to avoid the warning message
apply(Data,1, function(fo) ifelse(any(is.na(fo)),min(which(is.na(fo))), NA))
Consider a data frame which has 7 rows and 3 columns, with a first na in 4th row 2nd column.
df <- read.table(text = "rowname value test
A 3 8
B 1 3
C 2 8
D NA 5
E 2 3
F NA 4
G 6 9", header = TRUE)
Then the position of first na value in the dataframe can be found using which and is.na.
For eg: first na is found at index 1. Can loop through each index to get the subsequent nas in order.
pos_first_na <- which(is.na(df))[1]
To find the exact row and columns:
col_pos <- pos_first_na %% ncol(df)
row_pos <- pos_first_na %% nrow(df)
df[row_pos,col_pos]
This gives:
> row_pos
[1] 4
> col_pos
[1] 2
> df[row_pos,col_pos]
[1] NA
Consider the below solution, replacing x with your data:
# Loop through each row of x
apply(X = x, MARGIN = 1, FUN = function(row) {
# Return smallest index of NA values
min(which(is.na(row)))
})

Conditionally replace elements of a vector based on an index

It's best explained with an example.
I have a vector, or column from data.frame named vec:
vec <- c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA)
I would like a vectorized process (not a for loop) to change the three trailing NA when a 1 is observed.
The end vector would be:
c(NA, NA, 1, 1, 1, 1, NA, 1, 1, 1, 1, NA, NA, NA)
If we had:
vec <- c(NA, NA, 1, NA, 1, NA, NA, 1, NA, NA, NA, NA, NA, NA)
The end vector would look like:
c(NA, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, NA, NA)
A very badly written solution is:
vec2 <- vec
for(i in index(v)){
if(!is.na(v[i])) vec2[i] <- 1
if(i>3){
if(!is.na(vec[i-1])) vec2[i] <- 1
if(!is.na(vec[i-2])) vec2[i] <- 1
if(!is.na(vec[i-3])) vec2[i] <- 1
}
if(i==3){
if(!is.na(vec[i-1])) vec2[i] <- 1
if(!is.na(vec[i-2])) vec2[i] <- 1
}
if(i==2){
if(!is.na(vec[i-1])) vec2[i] <- 1
}
}
Another option:
`[<-`(vec,c(outer(which(vec==1),1:3,"+")),1)
# [1] NA NA 1 1 1 1 NA 1 1 1 1 NA NA NA
Although the above works with the examples, it stretches the length of vec if a 1 is found in the last positions. Better to make a simple check and wrap into a function:
threeNAs<-function(vec) {
ind<-c(outer(which(vec==1),1:3,"+"))
ind<-ind[ind<=length(vec)]
`[<-`(vec,ind,1)
}
Another fast solution:
vec[rep(which(vec == 1), each = 3) + c(1:3)] <- 1
which gives:
> vec
[1] NA NA 1 1 1 1 NA 1 1 1 1 NA NA NA
Benchmarking is only really useful when done on larger datasets. A benchmark with a 10k larger vector and the several posted solutions:
library(microbenchmark)
microbenchmark(ans.jaap = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
vec[rep(which(vec == 1), each = 3) + c(1:3)] <- 1},
ans.989 = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
r <- which(vec==1);
vec[c(mapply(seq, r, r+3))] <- 1},
ans.sotos = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
vec[unique(as.vector(t(sapply(which(vec == 1), function(i) seq(i+1, length.out = 3)))))] <- 1},
ans.gregor = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
vec[is.na(vec)] <- 0;
n <- length(vec);
vec <- vec + c(0, vec[1:(n-1)]) + c(0, 0, vec[1:(n-2)]) + c(0, 0, 0, vec[1:(n-3)]);
vec[vec == 0] <- NA},
ans.moody = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
output <- sapply(1:length(vec),function(i){any(!is.na(vec[max(0,i-3):i]))});
output[output] <- 1;
output[output==0] <- NA},
ans.nicola = {vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
`[<-`(vec,c(outer(which(vec==1),1:3,"+")),1)})
which gives the following benchmark:
Unit: microseconds
expr min lq mean median uq max neval cld
ans.jaap 1778.905 1937.414 3064.686 2100.595 2257.695 86233.593 100 a
ans.989 87688.166 89638.133 96992.231 90986.269 93326.393 182431.366 100 c
ans.sotos 125344.157 127968.113 132386.664 130117.438 132951.380 214460.174 100 d
ans.gregor 4036.642 5824.474 10861.373 6533.791 7654.587 87806.955 100 b
ans.moody 173146.810 178369.220 183698.670 180318.799 184000.062 264892.878 100 e
ans.nicola 966.927 1390.486 1723.395 1604.037 1904.695 3310.203 100 a
What really is 'vectorised', if not a loop written in a C-language?
Here's a C++ loop that benchmarks well.
vec <- c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA)
library(Rcpp)
cppFunction('NumericVector fixVec(NumericVector myVec){
int n = myVec.size();
int foundCount = 0;
for(int i = 0; i < n; i++){
if(myVec[i] == 1) foundCount = 1;
if(ISNA(myVec[i])){
if(foundCount >= 1 & foundCount <= 3){
myVec[i] = 1;
foundCount++;
}
}
}
return myVec;
}')
fixVec(vec)
# [1] NA NA 1 1 1 1 NA 1 1 1 1 NA NA NA
Benchmarks
library(microbenchmark)
microbenchmark(
ans.jaap = {
vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
vec[rep(which(vec == 1), each = 4) + c(0:3)] <- 1
},
ans.nicola = {
vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
`[<-`(vec,c(outer(which(vec==1),0:3,"+")),1)
},
ans.symbolix = {
vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4);
vec <- fixVec(vec)
}
)
# Unit: microseconds
# expr min lq mean median uq max neval
# ans.jaap 2017.789 2264.318 2905.2437 2579.315 3588.4850 4667.249 100
# ans.nicola 1242.002 1626.704 3839.4768 2095.311 3066.4795 81299.962 100
# ans.symbolix 504.577 533.426 838.5661 718.275 966.9245 2354.373 100
vec <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4)
vec <- fixVec(vec)
vec2 <- rep(c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA),1e4)
vec2[rep(which(vec2 == 1), each = 4) + c(0:3)] <- 1
identical(vec, vec2)
# [1] TRUE
The following code does what you asked for. It involves "shifting" the vector and then adding the shifted versions
vec[is.na(vec)] <- 0
n <- length(vec)
vec <- vec + c(0, vec[1:(n-1)]) + c(0, 0, vec[1:(n-2)]) + c(0, 0, 0, vec[1:(n-3)])
vec[vec == 0] <- NA
vec[vec != 0] <- 1
# vec | 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0 ,0, 0
# c(0, vec[1:(n-1)]) | + 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0 ,0, 0
# c(0, 0, vec[1:(n-2)]) | + 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0 ,0
# c(0,0,0,vec[1:(n-3)]) | + 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0
# |-------------------------------------------
# | 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0
A non-Vectorized solution, but nevertheless, another option using base R,
vec[unique(as.vector(t(sapply(which(vec == 1), function(i) seq(i+1, length.out = 3)))))] <- 1
vec
#[1] NA NA 1 1 1 1 NA 1 1 1 1 NA NA NA
vec1[unique(as.vector(t(sapply(which(vec1 == 1), function(i) seq(i+1, length.out = 3)))))] <- 1
vec1
#[1] NA NA 1 1 1 1 1 1 1 1 1 NA NA NA
How about this:
r <- which(vec==1)
vec[c(mapply(seq, r, r+3))] <- 1
Examples:
vec <- c(NA, NA, 1, NA, NA, NA, NA, 1, NA, NA, NA, NA, NA, NA)
#[1] NA NA 1 1 1 1 NA 1 1 1 1 NA NA NA
vec <- c(NA, NA, 1, NA, 1, NA, NA, 1, NA, NA, NA, NA, NA, NA)
#[1] NA NA 1 1 1 1 1 1 1 1 1 NA NA NA
With sapply, any, and is.na:
output <- sapply(1:length(vec),function(i){any(!is.na(vec[max(0,i-3):i]))})
output[output] <- 1
output[output==0] <- NA

Resources