Table1:(there are hundreds of IDs)
participant_id hpo_term year_of_birth affected_relative genome
123 kidney failure 2000 Y 38
123 hand tremor 2000 Y 38
123 kidney transplant 2000 Y 38
432 hypertension 1980 N 37
432 exotropia 1980 N 37
432 scissor gait 1980 N 37
I have two look-up tables:(with hundreds of values in each)
Renal lookup:
kidney failure
kidney transplant
hypertension
Non-renal lookup(with hundreds of values in each):
hand tremor
exotropia
scissor gait
Desired outcome:
participant_id kidney_hpo_term non_kidney_hpo_term year_of_birth affected_relative genome
123 kidney failure;kidney transplant hand tremor 2000 Y 38
432 hypertension exotropia;scissor gait 1980 N 37
Initially I tried:
library(dplyr); library(tidyr)
# Flag each row by whether its hpo_term appears in kidney.hpo, then spread
# hpo_term into one column per flag value.
# NOTE(review): kidney.hpo is created with read.delim(), so it is a data frame;
# %in% presumably needs the column vector (e.g. kidney.hpo[[1]]) - confirm.
pt.data %>%
mutate(kidney = hpo_term %in% kidney.hpo) %>%
# Several terms landing in the same cell are joined with ";".
pivot_wider(names_from = kidney, values_from = hpo_term,
values_fn = function(x)paste(x,collapse = ";"), values_fill = NA) %>%
# Columns are renamed by position; only three names are supplied here.
setNames(c("participant_id","Kidney","Non.kidney"))
with kidney.hpo <- read.delim("kidney_hpo_terms.txt", header = F)
But I get "Error in values_fn[[value]] ; object of type 'closure' is not subsettable"
Not sure what I am doing wrong and your help would be much appreciated.
There are several things to say about your data.
First, your table1 has redundant columns: year_of_birth, affected_relative, and genome are the same on every row for a given participant.
These would be better stored in a separate table, which I named table1_short.
As for your actual question, it is only a matter of checking whether a term is in a vector, which is done using %in%.
Here is how you could write the code:
library(tidyverse)
# Example data: one row per (participant, HPO term). The participant-level
# columns repeat on every row of the same participant.
table1 <- read.table(header = TRUE, text = "
participant_id hpo_term year_of_birth affected_relative genome
123 'kidney failure' 2000 Y 38
123 'hand tremor' 2000 Y 38
123 'kidney transplant' 2000 Y 38
432 hypertension 1980 N 37
432 exotropia 1980 N 37
432 'scissor gait' 1980 N 37")

# One row per participant holding the participant-level columns.
# distinct() keeps the first row per id and returns an ungrouped table.
table1_short <- table1 %>%
  select(-hpo_term) %>%
  distinct(participant_id, .keep_all = TRUE)

# Long table: just the participant / term pairs, selected by name rather than
# by position so a reordered input cannot pick the wrong columns.
table1_long <- table1 %>%
  select(participant_id, hpo_term)

renal_lookup <- c("kidney failure", "kidney transplant", "hypertension")
nonrenal_lookup <- c("hand tremor", "exotropia", "scissor gait")

# For each participant, paste together the terms found in each lookup, then
# attach the participant-level columns.
table1_long %>%
  group_by(participant_id) %>%
  summarise(
    kidney_hpo_term = paste(
      hpo_term[hpo_term %in% renal_lookup],
      collapse = ";"
    ),
    non_kidney_hpo_term = paste(
      hpo_term[hpo_term %in% nonrenal_lookup],
      collapse = ";"
    )
  ) %>%
  left_join(table1_short, by = "participant_id")
#> # A tibble: 2 x 6
#> participant_id kidney_hpo_term non_kidney_hpo_term year_of_birth affected_relative genome
#> <int> <chr> <chr> <int> <chr> <int>
#> 1 123 kidney failure;kidney transplant hand tremor 2000 Y 38
#> 2 432 hypertension exotropia;scissor gait 1980 N 37
Created on 2021-05-12 by the reprex package (v2.0.0)
This can be done with dcast in data.table as follows:
# Label every row with the name of the output column its term belongs in.
dtt[, group := paste0(
  ifelse(hpo_term %in% kidney_hpo, "kidney", "non_kidney"),
  "_hpo_term"
)]
# Spread to one column per label; all remaining columns (the `...` in the
# formula) identify the rows, and terms sharing a cell are pasted with ";".
dcast(
  dtt,
  ... ~ group,
  value.var = "hpo_term",
  fun.aggregate = paste,
  collapse = ";"
)
# participant_id year_of_birth affected_relative genome kidney_hpo_term
# 1: 123 2000 Y 38 kidney failure;kidney transplant
# 2: 432 1980 N 37 hypertension
# non_kidney_hpo_term
# 1: hand tremor
# 2: exotropia;scissor gait
Related
I am working with the R programming language.
I have the following dataset:
# Simulate 5000 people with four categorical attributes.
# The draws are made in the original order (gender, status, country, disease)
# so the seeded random stream is consumed identically.
set.seed(123)
gender <- as.factor(
  sample(c("Male", "Female"), size = 5000, replace = TRUE, prob = c(0.45, 0.55))
)
status <- as.factor(
  sample(c("Immigrant", "Citizen"), size = 5000, replace = TRUE, prob = c(0.3, 0.7))
)
country <- as.factor(
  sample(c("A", "B", "C", "D"), size = 5000, replace = TRUE, prob = rep(0.25, 4))
)
disease <- as.factor(
  sample(c("Yes", "No"), size = 5000, replace = TRUE, prob = c(0.4, 0.6))
)
# Assemble the data set; column order is gender, status, disease, country.
my_data <- data.frame(gender, status, disease, country)
I want to find out the relative percentage of each unique group of factors that have the disease vs do not have the disease.
As an example:
What percentage of Male Immigrants from Country A have the disease vs don't have the disease
What percentage of Male Citizens from Country A have the disease vs don't have the disease (both these percentages should add to 1)
etc.
I tried to do this with the following code:
# https://stackoverflow.com/questions/24576515/relative-frequencies-proportions-with-dplyr
library(dplyr)
step_1 = my_data %>% group_by (gender, status, country, disease) %>%
summarise (n=n()) %>%
mutate(rel.freq = paste0(round(100 * n/sum(n), 0), "%"))
`summarise()` has grouped output by 'gender', 'status', 'country'. You can override using the
`.groups` argument.
# A tibble: 32 x 6
# Groups: gender, status, country [16]
gender status country disease n rel.freq
<fct> <fct> <fct> <fct> <int> <chr>
1 Female Citizen A No 285 60%
2 Female Citizen A Yes 193 40%
Now (assuming this is correct), I am trying to make some modifications to this result - this should reduce the number of rows by half (i.e. two rows in step_1 for yes and no are now combined into a single row):
# Collapse the Yes/No rows of step_1 into one row per gender/status/country.
# NOTE(review): `disease` is redefined as first(disease) on the first line of
# summarize(), so the later `disease == ...` tests appear to see only that
# single value, and sum(disease == "No") counts matches instead of adding up
# `n`. This is consistent with the 1 / 0 counts in the printed result; summing
# n[disease == "No"] looks like what was intended - confirm.
step_2 = step_1 %>%
group_by(gender, status, country) %>%
summarize(disease = first(disease),
# number of people in this row that do not have the disease
n_no = sum(disease == "No"),
# number of people in this row that do have the disease
n_yes = sum(disease == "Yes"),
# relative percent of people in this row that do not have the disease
n_no_rel_freq = paste(round(sum(disease == "No") / sum(n) * 100), "%"),
# relative percent of people in this row that do have the disease
n_yes_rel_freq = paste(round(sum(disease == "Yes") / sum(n) * 100), "%"),
# overall percent of all people in this row relative to entire population
overall_percent = sum(n) / sum(step_1$n))
The code seems to run - but many of the percentages are now 0:
# A tibble: 16 x 9
# Groups: gender, status [4]
gender status country disease n_no n_yes n_no_rel_freq n_yes_rel_freq overall_percent
<fct> <fct> <fct> <fct> <int> <int> <chr> <chr> <dbl>
1 Female Citizen A No 1 0 0 % 0 % 0.102
2 Female Citizen B No 1 0 0 % 0 % 0.092
Can someone please show me how to fix this?
Thanks!
Note: The final result should look something like this
# desired result (sample)
gender status country n_no n_yes n_no_rel_freq n_yes_rel_freq total overall_percent
1 female citizen A 285 193 0.6 0.4 478 0.0956
Maybe you can make use of pivot_wider().
library(tidyverse)
# Count every gender/status/country/disease combination, then express each
# count as a percentage of its gender/status/country group.
step_1 <- my_data %>%
  group_by(gender, status, country, disease) %>%
  summarise(n = n()) %>%
  mutate(rel.freq = paste0(round(100 * n / sum(n), 0), "%"))
#> `summarise()` has grouped output by 'gender', 'status', 'country'. You can
#> override using the `.groups` argument.
# Put the Yes and No figures side by side (one row per gender/status/country)
# and add each row's share of the whole population.
step_1 |>
  group_by(country) |>
  pivot_wider(
    names_from = disease,
    values_from = n:rel.freq,
    names_prefix = "disease_"
  ) |>
  mutate(overallPerc = (n_disease_No + n_disease_Yes) / sum(step_1$n))
#> # A tibble: 16 × 8
#> # Groups: country [4]
#> gender status country n_disease_No n_disease_Yes rel.fre…¹ rel.f…² overa…³
#> <fct> <fct> <fct> <int> <int> <chr> <chr> <dbl>
#> 1 Female Citizen A 308 200 61% 39% 0.102
#> 2 Female Citizen B 291 169 63% 37% 0.092
#> 3 Female Citizen C 301 228 57% 43% 0.106
#> 4 Female Citizen D 245 189 56% 44% 0.0868
#> 5 Female Immigrant A 107 95 53% 47% 0.0404
#> 6 Female Immigrant B 126 76 62% 38% 0.0404
#> 7 Female Immigrant C 137 70 66% 34% 0.0414
#> 8 Female Immigrant D 129 74 64% 36% 0.0406
#> 9 Male Citizen A 237 167 59% 41% 0.0808
#> 10 Male Citizen B 247 163 60% 40% 0.082
#> 11 Male Citizen C 250 171 59% 41% 0.0842
#> 12 Male Citizen D 230 139 62% 38% 0.0738
#> 13 Male Immigrant A 103 68 60% 40% 0.0342
#> 14 Male Immigrant B 117 63 65% 35% 0.036
#> 15 Male Immigrant C 93 53 64% 36% 0.0292
#> 16 Male Immigrant D 102 52 66% 34% 0.0308
#> # … with abbreviated variable names ¹rel.freq_disease_No,
#> # ²rel.freq_disease_Yes, ³overallPerc
I would solve this with the help of data.table:
# data.table must be attached before setDT() and dcast() are available.
# Run install.packages("data.table") once beforehand if it is not installed.
library(data.table)
setDT(my_data)
# Count cases per combination, spread the disease levels into columns, then
# compute the share of "Yes" cases in each row.
my_data[, .N, by = .(gender, status, country, disease)][
  , dcast(.SD, gender + status + country ~ disease, value.var = "N")][
  , rel.freq := Yes / (No + Yes)][]
What is in there:
You install data.table
You convert my_data to a data.table (setDT(my_data))
With my_data[, .N, by = .(gender, status, country, disease)] you count cases (.N) grouped by all the variables after by=.
With [, dcast(.SD, gender+status+country~disease, value.var = "N")] you convert your long table into a wide one, using the levels of disease as new column headers and filling them with N, which is the number of cases.
With [, rel.freq := Yes/(No+Yes)] you create a new variable rel.freq that is the result of dividing the positive cases by the total cases.
With the [] you display the result to screen (you don't need this step, if you want to assign the result to a new object).
This is the result I obtained:
gender status country No Yes rel.freq
1: Female Citizen A 308 200 0.3937008
2: Female Citizen B 291 169 0.3673913
3: Female Citizen C 301 228 0.4310019
4: Female Citizen D 245 189 0.4354839
5: Female Immigrant A 107 95 0.4702970
6: Female Immigrant B 126 76 0.3762376
7: Female Immigrant C 137 70 0.3381643
8: Female Immigrant D 129 74 0.3645320
9: Male Citizen A 237 167 0.4133663
10: Male Citizen B 247 163 0.3975610
11: Male Citizen C 250 171 0.4061758
12: Male Citizen D 230 139 0.3766938
13: Male Immigrant A 103 68 0.3976608
14: Male Immigrant B 117 63 0.3500000
15: Male Immigrant C 93 53 0.3630137
16: Male Immigrant D 102 52 0.3376623
I want to group by district, summing the 'incoming' values across quarters and taking the 'stock' value from the last quarter (3), all in just one step. 'stock' cannot be summed across quarters.
My example dataframe:
library(dplyr)
df <- data.frame ("district"= rep(c("ARA", "BJI", "CMC"), each=3),
"quarter"=rep(1:3,3),
"incoming"= c(4044, 2992, 2556, 1639, 9547, 1191,2038,1942,225),
"stock"= c(19547,3160, 1533,5355,6146,355,5816,1119,333)
)
df
district quarter incoming stock
1 ARA 1 4044 19547
2 ARA 2 2992 3160
3 ARA 3 2556 1533
4 BJI 1 1639 5355
5 BJI 2 9547 6146
6 BJI 3 1191 355
7 CMC 1 2038 5816
8 CMC 2 1942 1119
9 CMC 3 225 333
The actual dataframe has ~45.000 rows and 41 variables of which 8 are of type stock.
The result should be:
# A tibble: 3 × 3
district stock incoming
<chr> <dbl> <dbl>
1 ARA 1533 9592
2 BJI 355 12377
3 CMC 333 4205
I know how to get to the result, but only in three steps, which I think is inefficient and error-prone given the data.
My approach:
# Stock cannot be added up across quarters, so take it from quarter 3 only.
basea <- df %>%
  group_by(district) %>%
  filter(quarter == 3) %>% # take only the last quarter
  summarise(across(stock, sum))
# Incoming is totalled over all quarters.
baseb <- df %>%
  group_by(district) %>%
  summarise(across(incoming, sum))
# Each pipeline ends at its own summarise(): a trailing pipe would feed the
# result into the next assignment. The join key is named explicitly.
final <- full_join(basea, baseb, by = "district")
Does anyone have any suggestions to perform the procedure in one (or at least two) steps?
Grateful,
Modus
This assumes the dataset only has 3 quarters, not 4, so that the last row of each district is quarter 3. If that's not the case, use nth(stock, 3) instead of last(stock).
library(tidyverse)
# Per district: last(stock) takes the stock value from the last row of the
# district as it appears in df, and incoming is totalled over all rows.
summarise(
  group_by(df, district),
  stock = last(stock),
  incoming = sum(incoming)
)
# A tibble: 3 × 3
district stock incoming
<chr> <dbl> <dbl>
1 ARA 1533 9592
2 BJI 355 12377
3 CMC 333 4205
here is a data.table approach
library(data.table)
# Convert df to a data.table, then aggregate per district: total incoming and
# the stock value from the last row of each group (.N is the group size).
setDT(df)
df[, .(incoming = sum(incoming), stock = stock[.N]), by = district]
district incoming stock
1: ARA 9592 1533
2: BJI 12377 355
3: CMC 4205 333
Here's a refactor that removes some of the duplicated code. This also seems like a prime use-case for creating a custom function that can be QC'd and maintained easier:
library(dplyr)
df <- data.frame ("district"= rep(c("ARA", "BJI", "CMC"), each=3),
"quarter"=rep(1:3,3),
"incoming"= c(4044, 2992, 2556, 1639, 9547, 1191,2038,1942,225),
"stock"= c(19547,3160, 1533,5355,6146,355,5816,1119,333)
)
#' Aggregate stock and incoming values per district
#'
#' @param df Data frame with the columns `district`, `quarter`, `incoming`
#'   and `stock`.
#' @param n_quarter Single, non-missing quarter value whose `stock` is
#'   reported.
#' @return One row per district with `stock` (summed over the rows where
#'   `quarter == n_quarter`) and `incoming` (summed over all rows). A district
#'   with no row for `n_quarter` gets `NA` in `stock`.
aggregate_stocks <- function(df, n_quarter) {
  # Fail early: a vector-valued n_quarter would be recycled silently by
  # filter() and select the wrong rows.
  stopifnot(
    is.data.frame(df),
    all(c("district", "quarter", "incoming", "stock") %in% names(df)),
    length(n_quarter) == 1L,
    !is.na(n_quarter)
  )

  base <- df %>%
    group_by(district)

  # Stock comes from the requested quarter only.
  basea <- base %>%
    filter(quarter == n_quarter) %>%
    summarise(across(stock, sum))

  # Incoming is totalled over every quarter.
  baseb <- base %>%
    summarise(across(incoming, sum))

  full_join(basea, baseb, by = "district")
}
aggregate_stocks(df, 3)
#> # A tibble: 3 × 3
#> district stock incoming
#> <chr> <dbl> <dbl>
#> 1 ARA 1533 9592
#> 2 BJI 355 12377
#> 3 CMC 333 4205
Here is the same solution as @Tom Hoel's, but without using a function to subset; instead just use []:
library(dplyr)
df %>%
group_by(district) %>%
summarise(stock = stock[3],
incoming = sum(incoming))
district stock incoming
<chr> <dbl> <dbl>
1 ARA 1533 9592
2 BJI 355 12377
3 CMC 333 4205
I have a dataset that has two rows of data, and want to tidy them using something like gather() but don't know how to mark both as key columns.
The data looks like:
Country US Canada US
org_id 332 778 920
02-15-20 25 35 54
03-15-20 30 10 60
And I want it to look like
country org_id date purchase_price
US 332 02-15-20 25
Canada 778 02-15-20 35
US 920 02-15-20 54
US 332 03-15-20 30
Canada 778 03-15-20 10
US 920 03-15-20 60
I know gather() can move the country row to a column, for example, but is there a way to move both the country and org_id rows to columns?
It is not a good idea to have duplicate column names in the data so I'll rename one of them.
names(df)[4] <- 'US_1'
gather has been retired and replaced with pivot_longer.
This is not a traditional reshape because the data in the 1st row needs to be treated differently than rest of the rows so we can perform the reshaping separately and combine the result to get one final dataframe.
library(dplyr)
library(tidyr)
# Long form of the date rows: one row per (date, original column name) pair.
df1 <- pivot_longer(slice(df, -1L), cols = -Country)

# Long form of the org_id row, matched to the date rows by column name. After
# the join, the column names become Country and the old Country column (which
# holds the dates) becomes date.
result <- df %>%
  slice(1L) %>%
  pivot_longer(-Country, values_to = "org_id") %>%
  select(-Country) %>%
  inner_join(df1, by = "name") %>%
  rename(Country = name, date = Country)
result
# Country org_id date value
# <chr> <int> <chr> <int>
#1 US 332 02-15-20 25
#2 US 332 03-15-20 30
#3 Canada 778 02-15-20 35
#4 Canada 778 03-15-20 10
#5 US_1 920 02-15-20 54
#6 US_1 920 03-15-20 60
data
df <- structure(list(Country = c("org_id", "02-15-20", "03-15-20"),
US = c(332L, 25L, 30L), Canada = c(778L, 35L, 10L), US = c(920L,
54L, 60L)), class = "data.frame", row.names = c(NA, -3L))
First, we paste together Country and org_id
library(tidyverse)
data <- set_names(data, paste(names(data), data[1,], sep = "-"))
data
Country-org_id US-332 Canada-778 US-920
1 org_id 332 778 920
2 02-15-20 25 35 54
3 03-15-20 30 10 60
Then, we drop the first row, pivot the table and separate the column name.
# Drop the org_id row (its values are now part of the column names), reshape
# to long form, then split each "country-org_id" name back into two columns.
df <- data %>%
  slice(-1) %>% # 2:n() would count backwards on a one-row table
  rename(date = `Country-org_id`) %>%
  pivot_longer(cols = -date, values_to = "price") %>%
  separate(col = name, into = c("country", "org_id"), sep = "-")
df
# A tibble: 6 x 4
date country org_id price
<chr> <chr> <chr> <int>
1 02-15-20 US 332 25
2 02-15-20 Canada 778 35
3 02-15-20 US 920 54
4 03-15-20 US 332 30
5 03-15-20 Canada 778 10
6 03-15-20 US 920 60
How can I scrape the data and add an additional column showing the year that was scraped?
# Scrape the "#stats" table from the basketball-reference draft page for
# `year`, append a `year` column, and write the result to nba_draftsR.csv.
nba_drafts <- function(year) {
# Build the page URL for the requested draft year.
url <- glue("https://www.basketball-reference.com/draft/NBA_{year}.html")
# Download the page, pick the node with id "stats" and parse it as a table.
tables<-read_html(url) %>%
html_nodes("#stats") %>%
html_table() %>%
as.tibble() %>%
add_column(year = year)
# NOTE(review): the output path is the same on every call, so looping over
# several years presumably leaves only the last year's table - confirm.
# NOTE(review): `year` is also passed positionally to write.csv(); it is
# unclear which argument it is meant to fill - confirm.
write.csv(tables, year, file = "nba_draftsR.csv", na ="")
}
2000:2017 %>%
walk(function(year) {
nba_drafts(year)
})
Error: Column 1 must be named.
I checked your code; the error happens at the step marked in the code below.
tables<-read_html(url) %>%
html_nodes("#stats") %>%
html_table() %>%
as.tibble() %>% # error is happening at this step
Debug Step:
The reason for this error is that the first three column names are blank (""), so you need to assign names first; only then can you convert the table to a tibble or data frame.
tables<-read_html(url) %>%
html_nodes("#stats") %>%
html_table() %>%
purrr::simplify() %>%
first()
names(tables)
[1] "" "" "" "Round 1" "Round 1" "" "Totals" "Totals" "Totals" "Totals" "Totals"
[12] "Shooting" "Shooting" "Shooting" "Per Game" "Per Game" "Per Game" "Per Game" "Advanced" "Advanced" "Advanced" "Advanced"
I have added a for loop to update the names
#' Scrape the NBA draft table for one year
#'
#' @param year Draft year, interpolated into the basketball-reference URL.
#' @return A tibble of the page's `#stats` table in which every column is
#'   renamed to `col_<position><original header>`, plus a `year` column.
nba_drafts <- function(year) {
  url <- glue("https://www.basketball-reference.com/draft/NBA_{year}.html")
  tables <- read_html(url) %>%
    html_nodes("#stats") %>%
    html_table() %>%
    purrr::simplify() %>%
    first()

  # Several headers are blank or repeated, which the tibble conversion
  # rejects. Prefixing each header with its position makes every name
  # non-empty and unique; seq_along() is also safe for a zero-column table.
  old_names <- names(tables)
  names(tables) <- paste0("col_", seq_along(old_names), old_names)

  tables %>%
    as_tibble() %>%
    add_column(year = year)
}
Output:
> nba_drafts("2019")
# A tibble: 63 x 23
col_1 col_2 col_3 `col_4Round 1` `col_5Round 1` col_6 col_7Totals col_8Totals col_9Totals col_10Totals col_11Totals
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 Rk Pk Tm Player College Yrs G MP PTS TRB AST
2 1 1 NOP Zion Williams… Duke 1 19 565 448 129 41
3 2 2 MEM Ja Morant Murray State 1 59 1771 1041 208 409
4 3 3 NYK RJ Barrett Duke 1 56 1704 803 279 143
5 4 4 LAL De'Andre Hunt… Virginia 1 63 2018 778 286 112
6 5 5 CLE Darius Garland Vanderbilt 1 59 1824 728 111 229
Below is a subset of my dataset. For each id, with rows sorted from earliest to latest date, how do I create a new column that flags the row immediately before each "LTD" status? The purpose is to identify the diagnosis (row) just before that id hits the "LTD" status. Thanks in advance!
Dataset:
id <- c(123,123,123,123,123,321,321)
diag <- c("injury1", "injury2" , "cancer","injury4","cancer", "injury5", "cancer")
date <- as.Date(c('2008-11-1','2009-3-25','2010-3-14',"2010-10-14","2010-11-14", '2015-3-14', '2015-4-15'))
status <- (c("STD", "STD", "LTD", "STD","LTD","STD", "LTD"))
data <- data.frame(id, diag, date, status)
Result (N for no, Y for yes):
123 injury1 2008-11-01 STD N
123 injury2 2009-03-25 STD Y
123 cancer 2010-03-14 LTD NA
123 injury4 2010-10-14 STD Y
123 cancer 2010-11-14 LTD NA
321 injury5 2015-03-14 STD Y
321 cancer 2015-04-15 LTD NA
We can convert date to a Date object, arrange by date, group_by id, and use case_when based on the conditions.
library(dplyr)
# Within each id, in date order: "Y" when the following row is LTD, NA on
# the LTD rows themselves, and "N" everywhere else.
data %>%
  mutate(date = as.Date(date)) %>%
  arrange(date) %>%
  group_by(id) %>%
  mutate(
    result = case_when(
      lead(status) == "LTD" ~ "Y",
      status == "LTD" ~ NA_character_,
      TRUE ~ "N"
    )
  )
# id diag date status result
# <dbl> <fct> <date> <fct> <chr>
#1 123 injury1 2008-11-01 STD N
#2 123 injury2 2009-03-25 STD Y
#3 123 cancer 2010-03-14 LTD NA
#4 123 injury4 2010-10-14 STD Y
#5 123 cancer 2010-11-14 LTD NA
#6 321 injury5 2015-03-14 STD Y
#7 321 cancer 2015-04-15 LTD NA
Using by() and step-by-step assignment.
# Flag, per id and in date order, the row just before each "LTD" row.
# The grouping ids are taken from the sorted rows: by() pairs rows and ids by
# position, so sorting only the rows would misgroup them whenever `data` is
# not already in date order. local() keeps the helper variable out of the
# workspace.
local({
  sorted <- data[order(data$date), ]
  do.call(rbind, by(sorted, sorted$id, function(x) {
    # The flag overwrites the diag column, as in the printed output.
    x$diag <- "N"
    # An LTD in the first row yields index 0, which R ignores on assignment.
    x$diag[which(x$status == "LTD") - 1] <- "Y"
    x$diag[x$status == "LTD"] <- NA
    # Column order: id, date, status, diag.
    x[c(1, 3:4, 2)]
  }))
})
# id date status diag
# 123.1 123 2008-11-01 STD N
# 123.2 123 2009-03-25 STD Y
# 123.3 123 2010-03-14 LTD <NA>
# 123.4 123 2010-10-14 STD Y
# 123.5 123 2010-11-14 LTD <NA>
# 321.6 321 2015-03-14 STD Y
# 321.7 321 2015-04-15 LTD <NA>