Conditionally counting the number of occurrences in a dataframe - performance improvement - R

I need to detect (among other things) the first occurrence of a non-"F" code in a patient's list, after the first "F" code occurrence. The code below seems to do this correctly, but it has proven far too slow on the server when run against a data set of one million observations.
The final data set should have a variable with the number of non-F codes (nhosp) and the first non-F code found after the first F-code appearance in the DAIGNOSTICO variable, with no duplicated IDs.
How can I improve this, in terms of both complexity and speed? A tidyverse pipe is preferred.
This is what the result should look like:
# A tibble: 7 × 6
# Groups: ID [7]
ID DAIGNOSTICO data_entrada data_saida nhosp ficd
<dbl> <chr> <date> <date> <dbl> <chr>
1 1555 F180 1930-04-05 2005-03-15 1 T124
2 1234 F100 1980-04-01 2005-03-02 2 O155
3 16666 F120 1990-06-05 2005-03-18 0 <NA>
4 123456 F145 2001-03-07 2005-03-11 2 T123
5 177778 F155 2001-04-13 2005-03-22 2 G123
6 166666 F125 2002-03-12 2005-03-19 2 W345
7 12345 F150 2002-06-03 2005-03-07 4 K709
This is what my code currently looks like:
library(readr)
library(dplyr)
library(tidyr)
simulation <- read_csv("SIMULADO.txt", col_types = cols(
  data_entrada = col_date("%d/%m/%Y"),
  data_saida = col_date("%d/%m/%Y")
))
simulation <- as.data.frame(simulation)
simulation[, "nhosp"] <- 0
oldpos <- 1
for (i in 1:nrow(simulation)) {
  if (grepl("F", simulation[i, "DAIGNOSTICO"])) { # Has F?
    oldpos <- i
    clin <- 0
    simulation[i, "hasF"] <- TRUE
  } else {
    simulation[i, "hasF"] <- FALSE
  }
  if (simulation[i, "ID"] == simulation[oldpos, "ID"]) { # same person?
    if (simulation[oldpos, "hasF"] == TRUE) { # did she/he have an F?
      simulation[i, "hasF"] <- TRUE
      if (simulation[i, "data_entrada"] > simulation[oldpos, "data_entrada"]) { # is it subsequent?
        if (!grepl("F", simulation[i, "DAIGNOSTICO"])) { # not F?
          simulation[i, "hasC"] <- TRUE
          clin <- 1
          simulation[i, "ficd"] <- simulation[i, "DAIGNOSTICO"]
          simulation[i, "nhosp"] <- clin
          first_cc <- simulation[i, "DAIGNOSTICO"]
        }
      }
    }
  }
}
dt1 <- simulation %>%
  arrange(data_entrada) %>%
  group_by(ID) %>%
  select(ficd) %>%
  drop_na() %>%
  slice(1)
dt2 <- simulation %>%
  arrange(data_entrada) %>%
  group_by(ID) %>%
  filter(hasF == TRUE) %>%
  mutate(nhosp = cumsum(nhosp),
         nhosp = max(nhosp)) %>%
  select(-ficd, -hasF, -hasC) %>%
  distinct(ID, .keep_all = TRUE) %>%
  full_join(dt1, by = "ID")
dt2
And this is an example data set, with some errors included to check the robustness of the code:
ID, DAIGNOSTICO, data_entrada, data_saida
123490, O100, 01/04/1980, 02/03/2005
123490, O100, 01/04/1981, 02/03/2005
123491, O101, 01/04/1980, 02/03/2005
123491, O101, 01/04/1981, 02/03/2005
1234, F100, 01/04/1980, 02/03/2005
1234, O155, 02/04/1980, 03/03/2005
1234, G123, 05/05/1982, 04/03/2005
12345, T124, 01/06/2002, 05/03/2005
12345, Y124, 02/06/2002, 06/03/2005
12345, F150, 03/06/2002, 07/03/2005
12345, K709, 04/06/2002, 08/03/2005
12345, Y709, 05/06/2002, 09/03/2005
12345, F150, 03/06/2002, 07/03/2005
12345, K710, 06/06/2002, 08/03/2005
12345, K711, 07/06/2002, 10/03/2005
12345, F150, 08/06/2002, 07/03/2005
123456, F145, 07/03/2001, 11/03/2005
123456, T123, 08/03/2001, 12/03/2005
123456, P123, 09/03/2001, 13/03/2005
1555 ,R155, 04/04/1930, 14/03/2005
1555 ,F180, 05/04/1930, 15/03/2005
1555 ,T124, 06/04/1930, 16/03/2005
1555 ,F708, 07/04/1930, 17/03/2005
16666 ,F120, 05/06/1990, 18/03/2005
166666, F125, 12/03/2002, 19/03/2005
166666, W345, 13/03/2002, 20/03/2005
166666, L123, 14/03/2002, 21/03/2005
177778, F155, 13/04/2001, 22/03/2005
177778, G123, 14/04/2001, 23/03/2005
177778, F190, 15/04/2001, 24/03/2005
177778, E124, 16/04/2001, 25/03/2005
177779, G155, 13/04/2001, 22/03/2005
177779, G123, 14/04/2001, 23/03/2005
177779, G190, 15/04/2001, 24/03/2005
177779, E124, 16/04/2001, 25/03/2005

You could use
library(dplyr)
library(stringr)
df %>%
  group_by(ID) %>%
  filter(cumsum(str_detect(DAIGNOSTICO, "^F")) > 0) %>%
  mutate(nhosp = sum(str_detect(DAIGNOSTICO, "^[^F]")),
         ficd = lead(DAIGNOSTICO)) %>%
  filter(str_detect(DAIGNOSTICO, "^F")) %>%
  slice(1) %>%
  ungroup()
This returns
# A tibble: 7 x 6
ID DAIGNOSTICO data_entrada data_saida nhosp ficd
<dbl> <chr> <chr> <chr> <int> <chr>
1 1234 F100 01/04/1980 02/03/2005 2 O155
2 1555 F180 05/04/1930 15/03/2005 1 T124
3 12345 F150 03/06/2002 07/03/2005 4 K709
4 16666 F120 05/06/1990 18/03/2005 0 NA
5 123456 F145 07/03/2001 11/03/2005 2 T123
6 166666 F125 12/03/2002 19/03/2005 2 W345
7 177778 F155 13/04/2001 22/03/2005 2 G123
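The key step is the cumulative-sum filter: within each ID, cumsum(str_detect(DAIGNOSTICO, "^F")) stays at 0 until the first F code and is positive from then on, so the filter keeps only the rows from the first F code onwards. A minimal illustration on one patient's codes (ID 12345 from the sample data):
library(stringr)
codes <- c("T124", "Y124", "F150", "K709", "Y709")
str_detect(codes, "^F")
#> [1] FALSE FALSE  TRUE FALSE FALSE
cumsum(str_detect(codes, "^F")) > 0
#> [1] FALSE FALSE  TRUE  TRUE  TRUE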
Edit
I think there might be a flaw, perhaps
library(dplyr)
library(stringr)
df %>%
  group_by(ID) %>%
  filter(
    cumsum(str_detect(DAIGNOSTICO, "^F")) == 1 |
      !str_detect(DAIGNOSTICO, "^F") & cumsum(str_detect(DAIGNOSTICO, "^F")) > 0
  ) %>%
  mutate(nhosp = sum(str_detect(DAIGNOSTICO, "^[^F]")),
         ficd = lead(DAIGNOSTICO)) %>%
  filter(str_detect(DAIGNOSTICO, "^F")) %>%
  slice(1) %>%
  ungroup()
is a better solution.

Related

Aggregating a dataframe with dplyr in R based on several dummy variables

I am using dplyr to aggregate my dataframe, so it shows percentages of people choosing specific protein design tasks by company size. I have different dummy variables for protein design tasks, because this was a multiple choice question in a survey.
I figured out a way to do this, but my code is very long, because I aggregate the data per task and then join all these separate dataframes together into one. I’m curious whether there is a more elegant (shorter) way to do this?
library(tidyverse)
EarlyAccess <- read_csv("https://dropbox.com/s/antzwk1jh4ldrhi/EarlyAccess_anon.csv?dl=1")
#################### STABILITY ################################################
Proportions_tasks_stability <- EarlyAccess %>%
  select(size, Improving.stability..generic..thermal..pH.) %>%
  group_by(size, Improving.stability..generic..thermal..pH.) %>%
  summarise(count_var_stability = n()) %>%
  mutate(total_group_by_size = sum(count_var_stability)) %>%
  mutate(pc_var_stability = count_var_stability / sum(count_var_stability) * 100) %>%
  filter(Improving.stability..generic..thermal..pH. == "Improving stability (generic, thermal, pH)") %>%
  select(size, Improving.stability..generic..thermal..pH., pc_var_stability)
######################## ACTIVITY #############################################
Proportions_tasks_activity <- EarlyAccess %>%
  select(size, Improving.activity) %>%
  group_by(size, Improving.activity) %>%
  summarise(count_var_activity = n()) %>%
  mutate(total_group_by_size = sum(count_var_activity)) %>%
  mutate(pc_var_activity = count_var_activity / sum(count_var_activity) * 100) %>%
  filter(Improving.activity == "Improving activity") %>%
  select(size, Improving.activity, pc_var_activity)
######################## BINDING AFFINITY ######################################
Proportions_tasks_binding.affinity <- EarlyAccess %>%
  select(size, Improving.binding.affinity) %>%
  group_by(size, Improving.binding.affinity) %>%
  summarise(count_var_binding.affinity = n()) %>%
  mutate(total_group_by_size = sum(count_var_binding.affinity)) %>%
  mutate(pc_var_binding.affinity = count_var_binding.affinity / sum(count_var_binding.affinity) * 100) %>%
  filter(Improving.binding.affinity == "Improving binding affinity") %>%
  select(size, Improving.binding.affinity, pc_var_binding.affinity)
# Then join them
Protein_design_tasks <- Proportions_tasks_stability %>%
  inner_join(Proportions_tasks_activity, by = "size") %>%
  inner_join(Proportions_tasks_binding.affinity, by = "size")
Using the datafile you provided, this should give the percentages of the selected category within each column for each size:
library(tidyverse)
df <-
  read_csv("https://dropbox.com/s/antzwk1jh4ldrhi/EarlyAccess_anon.csv?dl=1")
df |>
  group_by(size) |>
  summarise(
    pc_var_stability = sum(
      Improving.stability..generic..thermal..pH. == "Improving stability (generic, thermal, pH)",
      na.rm = TRUE
    ) / n() * 100,
    pc_var_activity = sum(Improving.activity == "Improving activity",
                          na.rm = TRUE) / n() * 100,
    pc_var_binding.affinity = sum(
      Improving.binding.affinity == "Improving binding affinity",
      na.rm = TRUE
    ) / n() * 100
  )
#> # A tibble: 7 × 4
#> size pc_var_stability pc_var_activity pc_var_binding.affinity
#> <chr> <dbl> <dbl> <dbl>
#> 1 1000-10000 43.5 47.8 34.8
#> 2 10000+ 65 65 70
#> 3 11-50 53.8 53.8 46.2
#> 4 2-10 51.1 46.8 46.8
#> 5 200-1000 64.7 52.9 52.9
#> 6 50-200 42.1 42.1 36.8
#> 7 Just me 48.5 39.4 54.5
Looking at your data, each column has either the string value you're testing for or NA, so you could make it even shorter/tidier just by counting non-NAs in relevant columns:
df |>
  group_by(size) |>
  summarise(across(
    c(
      Improving.stability..generic..thermal..pH.,
      Improving.activity,
      Improving.binding.affinity
    ),
    \(val) 100 * sum(!is.na(val)) / n()
  ))
If what you're aiming to do is summarise across all columns, then the latter method may work best: there are several ways of specifying which columns you want, so you don't necessarily need to type out every name and value (see the tidyselect sketch after the output below). You might also find it clearest to turn calculating and formatting the percentages into a named function to call:
library(tidyverse)
df <-
  read_csv("https://dropbox.com/s/antzwk1jh4ldrhi/EarlyAccess_anon.csv?dl=1",
           show_col_types = FALSE)
perc_nonmissing <- function(val) {
  sprintf("%.1f%%", 100 * sum(!is.na(val)) / n())
}
df |>
  group_by(size) |>
  summarise(across(-c(1:2), perc_nonmissing))
#> # A tibble: 7 × 12
#> size Disco…¹ Searc…² Under…³ Impro…⁴ Impro…⁵ Impro…⁶ Impro…⁷ Impro…⁸ Impro…⁹
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1000-… 21.7% 17.4% 43.5% 47.8% 39.1% 43.5% 30.4% 39.1% 39.1%
#> 2 10000+ 40.0% 55.0% 55.0% 65.0% 70.0% 65.0% 20.0% 30.0% 40.0%
#> 3 11-50 30.8% 26.9% 42.3% 53.8% 38.5% 53.8% 15.4% 30.8% 38.5%
#> 4 2-10 38.3% 40.4% 48.9% 46.8% 36.2% 51.1% 23.4% 31.9% 42.6%
# etc.
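For instance, since the three task columns used above all share the Improving. prefix, a tidyselect helper such as starts_with() saves typing the full names. A sketch reusing the perc_nonmissing() function defined above (adjust the prefix if your columns are named differently):
df |>
  group_by(size) |>
  summarise(across(starts_with("Improving."), perc_nonmissing))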

How to sum values for each unique group in R

In the dataset below, I want to identify the top 3 time-consuming projects.
library(dplyr)
TransID <-c(1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1014,1018,1022,1023,1024)
EmpID<-c('M001','M001','M001','M001','B005','B005','B005','B005','X101','X101','X101','Z101','K501','K501','K501','K501')
ProjectID <- c(200,200,200,200,500,500,500,500,950,950,950,950,1050,1050,1050,1050)
Site<-c('X','X','X','Y','Y','Y','Z','Z','Z','G','G','G','G','K','K','K')
Region <-c('NE','NW','SE','SW','MW','NW','SW','NE','NC','MW','NE','SE','SW','NC','SW','SE')
hour_difference<-c(1.45,2.14,2.53,3.69,1.73,2.47,3.63,1.59,0.75,1.18,2.78,9.55,1.85,2.39,5.52,0.23)
df = data.frame(TransID,EmpID,ProjectID,Site,Region,hour_difference)
df
Simply,
for each unique ProjectID, I want to sum the hour_difference and sort in descending order
My attempt:
df %>%
  group_by(ProjectID, hour_difference) %>%
  summarize(sum().sort_values())
Desired output:
for example, ProjectID = 950 will have a sum of 14.26
I'm not sure whether you want descending order by ProjectID or by the sum of hour_difference, but you may try:
By sum(hour_difference):
df %>%
  group_by(ProjectID) %>%
  summarise(res = sum(hour_difference)) %>%
  arrange(desc(res))
ProjectID res
<dbl> <dbl>
1 950 14.3
2 1050 9.99
3 200 9.81
4 500 9.42
By ProjectID:
df %>%
  group_by(ProjectID) %>%
  summarise(res = sum(hour_difference)) %>%
  arrange(desc(ProjectID))
ProjectID res
<dbl> <dbl>
1 1050 9.99
2 950 14.3
3 500 9.42
4 200 9.81
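Since the stated goal was the top 3 time-consuming projects, the first variant can be shortened with slice_max() (available in dplyr 1.0.0 and later):
df %>%
  group_by(ProjectID) %>%
  summarise(res = sum(hour_difference)) %>%
  slice_max(res, n = 3)
# top three by total hours: 950, 1050 and 200 (per the sums above)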

Rvest returning empty values and can't figure out why

I have a pretty templatized approach for this kind of thing and I've never seen it not work.
library(tidyverse)
library(rvest)
library(magrittr)
library(dplyr)
library(tidyr)
library(data.table)
library(zoo)
rivals_url <- paste0("https://rivals.com/prospect_rankings/rivals250/2021")
t300 <- map_df(rivals_url, ~.x %>% read_html %>%
  html_nodes(".position .pos , .last-name , .first-name") %>%
  html_text() %>%
  str_trim %>%
  str_split(" ") %>%
  matrix(ncol = 3, byrow = T) %>%
  as.data.frame)
When I run it, it just returns an empty set of values. Is there something specific about the site I'm pulling from that causes this? For example, this script works fine:
espn_url <- paste0("http://www.espn.com/college-sports/football/recruiting/playerrankings/_/view/rn300")
t300 <- map_df(espn_url, ~.x %>% read_html %>%
  html_nodes("td:nth-child(3), td:nth-child(8), Strong, .colhead td:nth-child(2)") %>%
  html_text() %>%
  str_trim %>%
  str_split(" ") %>%
  matrix(ncol = 3, byrow = T) %>%
  as.data.frame)
The reason your code does not work is that you are trying to scrape a dynamically rendered page, so the nodes you are targeting are not present in the raw HTML that read_html() fetches. You will need a package such as RSelenium to render the page first. The code below should work:
library(tidyverse)
library(rvest)
library(magrittr)
library(dplyr)
library(tidyr)
library(data.table)
library(zoo)
library(RSelenium)
rivals_url <- paste0("https://rivals.com/prospect_rankings/rivals250/2021")
#####Open remote browser
rD <- rsDriver(browser = "chrome")
remDr <- rD[["client"]]
remDr$navigate(rivals_url)
rivals_page <- read_html(remDr$getPageSource()[[1]])
##### Scrape target page and format results
t300 <- rivals_page %>%
  html_nodes(".position .pos , .last-name , .first-name") %>%
  html_text() %>%
  str_trim %>%
  str_split(" ") %>%
  matrix(ncol = 3, byrow = T) %>%
  as.data.frame %>%
  `colnames<-`(c("Last Name", "First Name", "Position"))
##### Close remote browser and terminate related processes
remDr$close()
rD$server$stop()
rm(rD, remDr)
gc()
system("taskkill /im java.exe /f", intern=FALSE, ignore.stdout=FALSE)
The site has a public facing API you can query directly:
library(jsonlite)
library(dplyr)
dat <- fromJSON("https://n.rivals.com/api/v1/ranks/4408/prospects?start=0&position=ALL%20POSITIONS&pageSize=250")
dat$prospects %>%
  as_tibble()
# A tibble: 250 x 26
id first_name last_name name position_abbrev~ stars prospect_url height weight verified_height verified_weight city hometown highschool profile_image_u~
<int> <chr> <chr> <chr> <chr> <int> <chr> <dbl> <dbl> <lgl> <lgl> <chr> <chr> <chr> <chr>
1 232205 "Maason" Smith Maas~ DT 5 https://n.r~ 77 297 NA NA Houma Houma, ~ Terrebonne https://images.~
2 209086 "JC" Latham JC L~ OT 5 https://n.r~ 78 310 NA NA Brad~ Bradent~ IMG Acade~ https://images.~
3 216686 "Korey" Foreman Kore~ SDE 5 https://n.r~ 77 254 TRUE FALSE Coro~ Corona,~ Centennial https://images.~
4 234618 "Amarius" Mims Amar~ OT 5 https://n.r~ 79 300 NA NA Coch~ Cochran~ Bleckley ~ https://images.~
5 210175 "Caleb" Williams Cale~ DUAL 5 https://n.r~ 73 200 NA NA Wash~ Washing~ Gonzaga https://images.~
6 208999 "J.T." Tuimoloau J.T.~ SDE 5 https://n.r~ 76 275 NA NA Bell~ Bellevu~ Eastside ~ https://images.~
7 210587 "Brock" Vandagri~ Broc~ PRO 5 https://n.r~ 75 198 NA NA Boga~ Bogart,~ Prince Av~ https://images.~
8 234013 "Tommy " Brockerm~ Tomm~ OT 5 https://n.r~ 78 288 NA NA Fort~ Fort Wo~ All Saint~ https://images.~
9 237486 "Ty" Thompson Ty T~ DUAL 5 https://n.r~ 76 203 NA NA Gilb~ Gilbert~ Mesquite https://images.~
10 194402 "Camar" Wheaton Cama~ RB 5 https://n.r~ 73 190 NA NA Garl~ Garland~ Lakeview ~ https://images.~
# ... with 240 more rows, and 11 more variables: state_abbreviation <chr>, school_id <int>, sport_school_id <int>, school_name <chr>, school_commits_url <chr>,
# recruit_year <int>, big_college_logo <chr>, status <chr>, commit_date <chr>, rank <int>, rank_change <int>
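If you only want the fields the original CSS selectors were after (first name, last name, position), you can keep just those columns from the API result. A sketch; starts_with("position") is used because the position column's full name is truncated in the print-out above:
dat$prospects %>%
  as_tibble() %>%
  select(first_name, last_name, starts_with("position"))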

How to add column to represent the year variable, as the data is scraped?

How can I scrape the data and add an additional column to show the year that is being scraped?
library(glue)
library(rvest)
library(tidyverse)

nba_drafts <- function(year) {
  url <- glue("https://www.basketball-reference.com/draft/NBA_{year}.html")
  tables <- read_html(url) %>%
    html_nodes("#stats") %>%
    html_table() %>%
    as.tibble() %>%
    add_column(year = year)
  write.csv(tables, year, file = "nba_draftsR.csv", na = "")
}
2000:2017 %>%
  walk(function(year) {
    nba_drafts(year)
  })
Error: Column 1 must be named.
I checked your code; the error is happening at the step highlighted in the code below.
tables <- read_html(url) %>%
  html_nodes("#stats") %>%
  html_table() %>%
  as.tibble() %>% # the error is happening at this step
Debug Step:
The reason for this error is that the first three column names are blank (""), which you need to assign first; only then can you convert to a tibble or data frame.
tables <- read_html(url) %>%
  html_nodes("#stats") %>%
  html_table() %>%
  purrr::simplify() %>%
  first()
names(tables)
[1] "" "" "" "Round 1" "Round 1" "" "Totals" "Totals" "Totals" "Totals" "Totals"
[12] "Shooting" "Shooting" "Shooting" "Per Game" "Per Game" "Per Game" "Per Game" "Advanced" "Advanced" "Advanced" "Advanced"
I have added a for loop to update the names:
nba_drafts <- function(year) {
  url <- glue("https://www.basketball-reference.com/draft/NBA_{year}.html")
  tables <- read_html(url) %>%
    html_nodes("#stats") %>%
    html_table() %>%
    purrr::simplify() %>%
    first()
  oldName <- names(tables)
  # updating names with col_
  for (i in 1:length(oldName)) {
    oldName[i] <- paste0("col_", i, oldName[i])
  }
  names(tables) <- oldName
  tables <- tables %>%
    as.tibble() %>%
    add_column(year = year)
  return(tables)
}
Output:
> nba_drafts("2019")
# A tibble: 63 x 23
col_1 col_2 col_3 `col_4Round 1` `col_5Round 1` col_6 col_7Totals col_8Totals col_9Totals col_10Totals col_11Totals
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Rk Pk Tm Player College Yrs G MP PTS TRB AST
2 1 1 NOP Zion Williams… Duke 1 19 565 448 129 41
3 2 2 MEM Ja Morant Murray State 1 59 1771 1041 208 409
4 3 3 NYK RJ Barrett Duke 1 56 1704 803 279 143
5 4 4 LAL De'Andre Hunt… Virginia 1 63 2018 778 286 112
6 5 5 CLE Darius Garland Vanderbilt 1 59 1824 728 111 229
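As an aside, the renaming loop can be replaced by a single vectorised paste0() call, since paste0() recycles over the whole vector of names:
# equivalent to the for loop above: prefix every column name with col_<index>
names(tables) <- paste0("col_", seq_along(names(tables)), names(tables))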

how to scrape text from a HTML body

I've never scraped. Would it be straightforward to scrape the text in the main, big gray box only from the link below (starting with header SRUS43 KMSR 271039, ending with .END)? My end goal is to basically have three tidy columns of data from all that text: the five digit codes, the values in inches, and the basin elevation descriptions, so any pointers with processing the text format are welcome, too.
https://www.nohrsc.noaa.gov/shef_archive/index.html?rfc=cnrfc&product=swe&year=2019&month=3&day=27&hour=6
Thank you for any help.
Reading in the text is fairly easy (see @DiceBoyT's answer). Cleaning the text up into three columns is a bit more involved. The code below could use some clean-up (especially the regex), but it gets the job done:
library(tidyverse)
library(rvest)
text <- read_html("https://www.nohrsc.noaa.gov/shef_archive/index.html?rfc=cnrfc&product=swe&year=2019&month=3&day=27&hour=6") %>%
  html_node(".notes") %>%
  html_text()
df <- tibble(txt = read_lines(text))
df %>%
  mutate(
    row = row_number(),
    with_code = str_extract(txt, "^[A-z0-9]{5}\\s+\\d+(\\.)?\\d"),
    wo_code = str_extract(txt, "^:?\\s+\\d+(\\.)?\\d") %>% str_extract("[:digit:]+\\.?[:digit:]"),
    basin_desc = if_else(!is.na(with_code), lag(txt, 1), NA_character_) %>% str_sub(start = 2)
  ) %>%
  separate(with_code, c("code", "val"), sep = "\\s+") %>%
  mutate(
    combined_val = case_when(
      !is.na(val) ~ val,
      !is.na(wo_code) ~ wo_code,
      TRUE ~ NA_character_
    ) %>% as.numeric
  ) %>%
  filter(!is.na(combined_val)) %>%
  mutate(
    code = zoo::na.locf(code),
    basin_desc = zoo::na.locf(basin_desc)
  ) %>%
  select(
    code, combined_val, basin_desc
  )
#> # A tibble: 643 x 3
#> code combined_val basin_desc
#> <chr> <dbl> <chr>
#> 1 ACSC1 0 San Antonio Ck - Sunol
#> 2 ADLC1 0 Arroyo De La Laguna
#> 3 ADOC1 0 Santa Ana R - Prado Dam
#> 4 AHOC1 0 Arroyo Honda nr San Jose
#> 5 AKYC1 41 SF American nr Kyburz
#> 6 AKYC1 3.2 SF American nr Kyburz
#> 7 AKYC1 42.2 SF American nr Kyburz
#> 8 ALQC1 0 Alamo Canal nr Pleasanton
#> 9 ALRC1 0 Alamitos Ck - Almaden Res
#> 10 ANDC1 0 Coyote Ck - Anderson Res
#> # ... with 633 more rows
Created on 2019-03-27 by the reprex package (v0.2.1)
This is pretty straightforward to scrape with rvest:
library(rvest)
text <- read_html("https://www.nohrsc.noaa.gov/shef_archive/index.html?rfc=cnrfc&product=swe&year=2019&month=3&day=27&hour=6") %>%
  html_node(".notes") %>%
  html_text()
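From there, splitting the blob into one element per line gives a starting point for pulling out the codes, values and basin descriptions (the fuller answer above does the same thing with read_lines()):
# one element per line of the report, ready for regex extraction
lines <- strsplit(text, "\n")[[1]]
head(lines)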
