This question already has answers here:
Extracting columns having greater than certain values in R dataframe
(5 answers)
Select columns that don't contain any NA value in R
(3 answers)
Closed 2 years ago.
I have the following dataset:
structure(list(Species = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label =
c("Bream", "Parkki", "Perch", "Pike", "Roach", "Smelt", "Whitefish"),
class = "factor"),
WeightGRAM = c(242, 290, 340, 363, 430, 450), VertLengthCM = c(23.2,
24, 23.9, 26.3, 26.5, 26.8), DiagLengthCM = c(25.4, 26.3,
26.5, 29, 29, 29.7), CrossLengthCM = c(30, 31.2, 31.1, 33.5,
34, 34.7), HeightCM = c(11.52, 12.48, 12.3778, 12.73, 12.444,
13.6024), WidthCM = c(4.02, 4.3056, 4.6961, 4.4555, 5.134,
4.9274)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
I am trying to check for "0" or negative values in the numeric columns and remove them.
I have the following code:
fish_data <- fish_data [which(rowSums(fish_data) > 0), ]
But I get an error message:
Error in rowSums(fish_data) : 'x' must be numeric
I guess this message came up because my "Species" column is a factor.
How can I skip the first column and have R check only the numeric columns for zero or negative values?
Here is a way that keeps only the columns with no values less than or equal to zero.
# Keep only the columns with no zero or negative values; non-numeric
# columns (e.g. the factor Species) are always kept.
# vapply() is preferred over sapply(): the result is guaranteed to be a
# logical vector regardless of the input, so empty/odd inputs cannot
# silently change the return type.
keep <- vapply(fish_data, function(x) {
if(is.numeric(x)) all(x > 0) else TRUE
}, logical(1))
fish_data[keep]
## A tibble: 6 x 7
# Species WeightGRAM VertLengthCM DiagLengthCM CrossLengthCM HeightCM WidthCM
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Bream 242 23.2 25.4 30 11.5 4.02
#2 Bream 290 24 26.3 31.2 12.5 4.31
#3 Bream 340 23.9 26.5 31.1 12.4 4.70
#4 Bream 363 26.3 29 33.5 12.7 4.46
#5 Bream 430 26.5 29 34 12.4 5.13
#6 Bream 450 26.8 29.7 34.7 13.6 4.93
Using dplyr we can use select to select columns where all values are greater than 0 or are not numeric.
library(dplyr)
# Keep non-numeric columns, plus numeric columns whose values are all
# positive; `||` short-circuits, so all(. > 0) is never evaluated on a
# non-numeric column. Equivalent to the longer two-branch predicate.
df %>% select(where(~ !is.numeric(.) || all(. > 0)))
# A tibble: 6 x 7
# Species WeightGRAM VertLengthCM DiagLengthCM CrossLengthCM HeightCM WidthCM
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Bream 242 23.2 25.4 30 11.5 4.02
#2 Bream 290 24 26.3 31.2 12.5 4.31
#3 Bream 340 23.9 26.5 31.1 12.4 4.70
#4 Bream 363 26.3 29 33.5 12.7 4.46
#5 Bream 430 26.5 29 34 12.4 5.13
#6 Bream 450 26.8 29.7 34.7 13.6 4.93
In the previous version of dplyr, we can use select_if :
# For dplyr < 1.0.0; select_if() is superseded by select(where(...)).
df %>% select_if(~(is.numeric(.) && all(. > 0)) || !is.numeric(.))
You only need to specify the columns for the rowSums() function:
# Restrict rowSums() to the numeric columns 2:7; column 1 (Species) is a
# factor and would make rowSums() fail with "'x' must be numeric".
fish_data <- fish_data[which(rowSums(fish_data[,2:7]) > 0), ]
Note that rowSums() sums all values across the row; I'm not sure if that's what you really want to achieve.
You can check the output of rowSums() with:
> rowSums(fish_data[,2:7])
[1] 336.1400 388.2856 438.5739 468.9855 537.0780 559.7298
Thanks all, I think I figured it out.
I should be typing:
# NOTE(review): the comparison below also touches the factor column
# Species, which is what triggers the "'<=' not meaningful for factors"
# warning; verify the factor entries are left unchanged as intended.
fish_data[fish_data <= 0] <- NA # convert values less than or equal to 0 to NA
fish_data <- na.omit(fish_data) # drop every row that contains an NA
But I get a warning message:
Warning message: In Ops.factor(left, right) : ‘<=’ not meaningful for
factors
# Option 1: (safer because it retains rows containing NAs)
# Subset data.frame to exclude observations with 0 values:
# NOTE(review): options 1-3 test `!= 0` (negative values are kept),
# while options 4-5 test `> 0`; choose the comparison matching intent.
# data.frame => stdout (console)
df[rowMeans(df != 0, na.rm = TRUE) == 1,]
# Option 2: (more dangerous because it removes all rows containing NAs)
# Subset data.frame to exclude observations with 0 values:
# data.frame => stdout (console)
df[complete.cases(replace(df, df == 0, NA)),]
# Option 3 (variant of Option 1, restricted to numeric columns):
# vapply() guarantees a logical selection vector (safer than sapply).
# data.frame => stdout (console)
df[rowMeans(Vectorize(function(x){x != 0})(df[, vapply(df, is.numeric, logical(1))]),
            na.rm = TRUE) == 1,]
# Option 4: using higher-order functions — intersect the row indices
# that are positive in every numeric column:
# data.frame => stdout (console)
df[Reduce(intersect,
          Map(function(x){which(x > 0)}, df[, vapply(df, is.numeric, logical(1))])), ]
# Option 5 tidyverse: filter_if() is superseded in dplyr >= 1.0.0;
# if_all() keeps rows where every numeric column is positive.
# data.frame => stdout (console)
library(dplyr)
df %>%
  filter(if_all(where(is.numeric), ~ . > 0))
Data:
df <- structure(list(Species = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label =
c("Bream", "Parkki", "Perch", "Pike", "Roach", "Smelt", "Whitefish"),
class = "factor"),
WeightGRAM = c(242, 290, 340, 363, 0, 450), VertLengthCM = c(23.2,
24, 23.9, 26.3, 26.5, 26.8), DiagLengthCM = c(25.4, 26.3,
26.5, 29, 29, 29.7), CrossLengthCM = c(30, 31.2, 31.1, 33.5,
34, 34.7), HeightCM = c(11.52, 0, 12.3778, 12.73, 12.444,
13.6024), WidthCM = c(4.02, 4.3056, 4.6961, 4.4555, 5.134,
4.9274)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))
Related
I'm currently working with a large dataframe of 75 columns and roughly 9,500 rows. This dataframe contains observations for every day from 1995-2019 for several observation points.
Edit: The print from dput(head(df))
> dput(head(df))
structure(list(date = structure(c(9131, 9132, 9133, 9134, 9135,
9136), class = "Date"), x1 = c(50.75, 62.625, 57.25, 56.571,
36.75, 39.125), x2 = c(62.25, 58.714, 49.875, 56.375, 43.25,
41.625), x3 = c(90.25, NA, 70.125, 75.75, 83.286, 98.5),
x4 = c(60, 72, 68.375, 65.5, 63.25, 55.875), x5 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), xn = c(53.25,
61.143, 56.571, 58.571, 36.25, 44.375), year = c(1995, 1995, 1995, 1995,
1995, 1995), month = c(1, 1, 1, 1, 1, 1), day = c(1, 2, 3,
4, 5, 6)), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
The dataframe looks like this sample from it:
date x1 x2 x3 x4 x5 xn year month day
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1995-01-01 50.8 62.2 90.2 60 NA 53.2 1995 1 1
2 1999-08-02 62.6 58.7 NA 72 NA 61.1 1999 8 2
3 2001-09-03 57.2 49.9 70.1 68.4 NA 56.6 2001 9 3
4 2008-05-04 56.6 56.4 75.8 65.5 NA 58.6 2008 5 4
5 2012-04-05 36.8 43.2 83.3 63.2 NA 36.2 2012 4 5
6 2019-12-31 39.1 41.6 98.5 55.9 NA 44.4 2019 12 31
str(df)
tibble [9,131 x 75] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ date : Date[1:9131], format: "1995-01-01" "1995-01-02" ...
$ x1 : num [1:9131] 50.8 62.6 57.2 56.6 36.8 ...
$ x2 : num [1:9131] 62.2 58.7 49.9 56.4 43.2 ...
xn
$ year : num [1:9131] 1995 1995 1995 1995 1995 ...
$ month : num [1:9131] 1 1 1 1 1 1 1 1 1 1 ...
$ day : num [1:9131] 1 2 3 4 5 6 7 8 9 10 ...
My goal is to get for every observation point xn the count of all observations which cross a certain limit per year.
So far i tried to reach this with the Aggregate function.
To get the mean of every year i used the following command:
aggregate(list(df), by=list(year=df$year), mean, na.rm=TRUE)
This works perfectly; I get the mean for every year for every observation point.
To get the sum of one station i used the following code
aggregate(list(x1=df$x1), by=list(year=df$year), function(x) sum(rle(x)$values>120, na.rm=TRUE))
which results in this print:
year x1
1 1995 52
2 1996 43
3 1997 44
4 1998 42
5 1999 38
6 2000 76
7 2001 52
8 2002 58
9 2003 110
10 2004 34
11 2005 64
12 2006 46
13 2007 46
14 2008 17
15 2009 41
16 2010 30
17 2011 40
18 2012 47
19 2013 40
20 2014 21
21 2015 56
22 2016 27
23 2017 45
24 2018 22
25 2019 45
So far, so good. I know I could expand the code by adding (.., x2=data$x2, x3=data$x3, .. xn) to the list argument in the code above, which I tried and it works.
But how do I get them all at once?
I tried the following codes:
aggregate(.~(date, year, month, day), by=list(year=df$year), function(x) sum(rle(x)$values>120, na.rm=TRUE))
Fehler: Unerwartete(s) ',' in "aggregate(.~(date,"
aggregate(.~date+year+month+day, by=list(year=df$year), function(x) sum(rle(x)$values>120, na.rm=TRUE))
Fehler in as.data.frame.default(data, optional = TRUE) :
cannot coerce class ‘"function"’ to a data.frame
aggregate(. ~ date + year + month + day, data = df,by=list(year=df$year), function(x) sum(rle(x)$values>120, na.rm=TRUE))
Fehler in aggregate.data.frame(lhs, mf[-1L], FUN = FUN, ...) :
Argumente müssen dieselbe Länge haben
But unfortunately none of them works. Could someone please give me a hint where my mistake is?
Here is an answer that uses base R, and since none of the data in the example data is above 120, we set a criterion of above 70.
data <- structure(
list(
date = structure(c(9131, 9132, 9133, 9134, 9135,
9136), class = "Date"),
x1 = c(50.75, 62.625, 57.25, 56.571,
36.75, 39.125),
x2 = c(62.25, 58.714, 49.875, 56.375, 43.25,
41.625),
x3 = c(90.25, NA, 70.125, 75.75, 83.286, 98.5),
x4 = c(60, 72, 68.375, 65.5, 63.25, 55.875),
x5 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
xn = c(53.25,
61.143, 56.571, 58.571, 36.25, 44.375),
year = c(1995, 1995, 1995, 1995,
1995, 1995),
month = c(1, 1, 1, 1, 1, 1),
day = c(1, 2, 3,
4, 5, 6)
),
row.names = c(NA,-6L),
class = c("tbl_df", "tbl",
"data.frame"
))
First, we create a subset of the data that contains all columns containing x, and set them to TRUE or FALSE based on whether the value is greater than 70.
# Select the measurement columns whose names start with "x".
# startsWith() matches the stated intent; the original grepl("x", ...)
# would also match an "x" anywhere in a name (e.g. "max").
theCols <- data[, startsWith(colnames(data), "x")]
Second, we cbind() the year onto the matrix of logical values.
# Bind year onto a data.frame of logicals: TRUE where the value exceeds 70.
x_logical <- cbind(year = data$year,as.data.frame(apply(theCols,2,function(x) x > 70)))
Finally, we use aggregate across all columns other than year and sum the columns.
# Count TRUE values (threshold exceedances) per year for every x column.
aggregate(x_logical[2:ncol(x_logical)],by = list(x_logical$year),sum,na.rm=TRUE)
...and the output:
Group.1 x1 x2 x3 x4 x5 xn
1 1995 0 0 5 1 0 0
>
Note that by using colnames() to select the columns whose names start with x, we make this a general solution that will handle a varying number of x locations.
Two tidyverse solutions
A tidyverse solution to the same problem is as follows. It includes the following steps.
Use mutate() with across() to create the TRUE / FALSE versions of the x variables. Note that across() requires dplyr 1.0.0, which is currently in development but due for production release the week of May 25th.
Use pivot_longer() to allow us to summarise() multiple measures without a lot of complicated code.
Use pivot_wider() to convert the data back to one column for each x measurement.
...and the code is:
devtools::install_github("tidyverse/dplyr") # needed for across()
library(dplyr)
library(tidyr)
library(lubridate)
# Flag values above 70, reshape to long form, count flags per year and
# measure, then spread back to one column per measure.
data %>%
  # a comparison already yields TRUE/FALSE; if_else(. > 70, TRUE, FALSE)
  # was redundant, and the stray `.` in mutate(., ...) is unnecessary
  mutate(across(starts_with("x"), ~ . > 70)) %>%
  select(-year,-month,-day) %>% group_by(date) %>%
  pivot_longer(starts_with("x"),names_to = "measure",values_to = "value") %>%
  mutate(year = year(date)) %>% group_by(year,measure) %>%
  select(-date) %>%
  summarise(value = sum(value,na.rm=TRUE)) %>%
  pivot_wider(id_cols = year,names_from = "measure",
              values_from = value)
...and the output, which matches the Base R solution that I originally posted:
`summarise()` regrouping output by 'year' (override with `.groups` argument)
# A tibble: 1 x 7
# Groups: year [1]
year x1 x2 x3 x4 x5 xn
<dbl> <int> <int> <int> <int> <int> <int>
1 1995 0 0 5 1 0 0
>
...and here's an edited version of the other answer that will also produce the same results as above. This solution implements pivot_longer() before creating the logical variable for exceeding the threshold, so it does not require the across() function. Also note that since this uses 120 as the threshold value and none of the data meets this threshold, the sums are all 0.
# Reshape x1:x5 to long form, flag values above the 120 threshold,
# count the flags per year and measure, then spread back to wide form.
df_example %>%
pivot_longer(x1:x5) %>%
mutate(greater_120 = value > 120) %>%
group_by(year,name) %>%
summarise(sum_120 = sum(greater_120,na.rm = TRUE)) %>%
pivot_wider(id_cols = year,names_from = "name", values_from = sum_120)
...and the output:
`summarise()` regrouping output by 'year' (override with `.groups` argument)
# A tibble: 1 x 6
# Groups: year [1]
year x1 x2 x3 x4 x5
<dbl> <int> <int> <int> <int> <int>
1 1995 0 0 0 0 0
>
Conclusions
As usual, there are many ways to accomplish a given task in R. Depending on one's preferences, the problem can be solved with Base R or the tidyverse. One of the quirks of the tidyverse is that some operations such as summarise() are much easier to perform on narrow format tidy data than on wide format data. Therefore, it's important to be proficient with tidyr::pivot_longer() and pivot_wider() when working in the tidyverse.
That said, with the production release of dplyr 1.0.0, the team at RStudio continues to add features that facilitate working with wide format data.
This should solve your problem
library(tidyverse)
library(lubridate)
df_example <- structure(list(date = structure(c(9131, 9132, 9133, 9134, 9135,
9136), class = "Date"), x1 = c(50.75, 62.625, 57.25, 56.571,
36.75, 39.125), x2 = c(62.25, 58.714, 49.875, 56.375, 43.25,
41.625), x3 = c(90.25, NA, 70.125, 75.75, 83.286, 98.5),
x4 = c(60, 72, 68.375, 65.5, 63.25, 55.875), x5 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), xn = c(53.25,
61.143, 56.571, 58.571, 36.25, 44.375), year = c(1995, 1995, 1995, 1995,
1995, 1995), month = c(1, 1, 1, 1, 1, 1), day = c(1, 2, 3,
4, 5, 6)), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
# Count per year how many x-values exceed 120 (none do in this sample,
# so the sums are all 0).
df_example %>%
pivot_longer(x1:x5) %>%
mutate(greater_120 = value > 120) %>%
group_by(year(date)) %>%
summarise(sum_120 = sum(greater_120,na.rm = TRUE))
I would really appreciate your help in using R for data wrangling. I have a data where I want to split one column (variable) into two whenever applicable as conditioned by other variables. For example, as per the sample below, the data represents reactions time measures (RT1 and RT2) of some words (item) that appear in different times of reading (block). I want to see if RT1 and RT2 values in block 3, 4, and 5 are correlated with RT1 and RT2 values of the same item at block 1. The target items that appeared in block 1 and re-appeared in subsequent blocks are coded as 'EI' in the column 'condition', whereas items coded as 'E' or 'I' appeared only once.
dput(d1)
structure(list(RECORDING_SESSION_LABEL = c(26, 26, 26, 26, 26,
26, 26, 26), RT1 = c(5171, 3857, 3447, 314, 460, 731, 957, 1253
), RT2 = c(357, 328, 122, 39, 86, 132, 173, 215), item = c("foreign",
"detailed", "large", "foreign", "foreign", "large", "large",
"disputable"), block = c(1, 1, 1, 3, 4, 3, 4, 3), condition = c("EI",
"E", "EI", "EI", "EI", "EI", "EI", "I")), row.names = c(NA, -8L
), class = c("tbl_df", "tbl", "data.frame"))
Where a sample of the data would look like this:
> d1
# A tibble: 8 x 6
RECORDING_SESSION_LABEL RT1 RT2 item block condition
<dbl> <dbl> <dbl> <chr> <dbl> <chr>
1 26 5171 357 foreign 1 EI
2 26 3857 328 detailed 1 E
3 26 3447 122 large 1 EI
4 26 314 39 foreign 3 EI
5 26 460 86 foreign 4 EI
6 26 731 132 large 3 EI
7 26 957 173 large 4 EI
8 26 1253 215 disputable 3 I
In order to present in a format that R would understand, the target data frame I want to achieve would be similar to the one below (where the highlighted columns should be added). Rows in blanks at these columns represent items which do not appear repetitively (condition is not coded as 'EI') ; therefore, they are irrelevant and should be coded as 'NA'.
dput(d2)
structure(list(RECORDING_SESSION_LABEL = c(26, 26, 26, 26, 26,
26, 26, 26), `RT 1` = c(5171, 3857, 3447, 314, 460, 731, 957,
1253), RT2 = c(357, 328, 122, 39, 86, 132, 173, 215), item = c("foreign",
"detailed", "large", "foreign", "foreign", "large", "large",
"disputable"), block = c(1, 1, 1, 3, 4, 3, 4, 3), condition = c("EI",
"E", "EI", "EI", "EI", "EI", "EI", "I"), `RT 1_at_block1` = c(NA,
NA, NA, 5171, 5171, 3447, 3447, NA), RT2_at_block1 = c(NA, NA,
NA, 357, 357, 122, 122, NA)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
And a sample of the data format targeted would look like this:
> d2
# A tibble: 8 x 8
RECORDING_SESSI~ `RT 1` RT2 item block condition `RT 1_at_block1`
<dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
1 26 5171 357 fore~ 1 EI NA
2 26 3857 328 deta~ 1 E NA
3 26 3447 122 large 1 EI NA
4 26 314 39 fore~ 3 EI 5171
5 26 460 86 fore~ 4 EI 5171
6 26 731 132 large 3 EI 3447
7 26 957 173 large 4 EI 3447
8 26 1253 215 disp~ 3 I NA
# ... with 1 more variable: RT2_at_block1 <dbl>
> head(d2)
# A tibble: 6 x 8
RECORDING_SESSION_LABEL `RT 1` RT2 item block condition `RT 1_at_block1` RT2_at_block1
<dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
1 26 5171 357 foreign 1 EI NA NA
2 26 3857 328 detailed 1 E NA NA
3 26 3447 122 large 1 EI NA NA
4 26 314 39 foreign 3 EI 5171 357
5 26 460 86 foreign 4 EI 5171 357
6 26 731 132 large 3 EI 3447 122
Thanks in advance for any help.
A possible solution using dplyr:
d1 <- structure(list(RECORDING_SESSION_LABEL = c(26, 26, 26, 26, 26, 26, 26, 26),
RT1 = c(5171, 3857, 3447, 314, 460, 731, 957, 1253),
RT2 = c(357, 328, 122, 39, 86, 132, 173, 215),
item = c("foreign", "detailed", "large", "foreign", "foreign", "large", "large", "disputable"),
block = c(1, 1, 1, 3, 4, 3, 4, 3), condition = c("EI", "E", "EI", "EI", "EI", "EI", "EI", "I")),
row.names = c(NA, -8L), class = c("tbl_df", "tbl", "data.frame"))
library(dplyr)
# Attach the block-1 RT1/RT2 of each item as new columns. Explicit join
# keys avoid dplyr's "Joining, by = ..." guessing; the keys named here
# are exactly the columns the implicit join would have used.
d2 <- d1 %>%
  left_join(d1 %>% filter(block == 1) %>% select(RECORDING_SESSION_LABEL, item, RT1_at_block1 = RT1),
            by = c("RECORDING_SESSION_LABEL", "item")) %>%
  left_join(d1 %>% filter(block == 1) %>% select(RECORDING_SESSION_LABEL, item, RT2_at_block1 = RT2),
            by = c("RECORDING_SESSION_LABEL", "item"))
After that, d2 looks like this:
RECORDING_SESSION_LABEL RT1 RT2 item block condition RT1_at_block1 RT2_at_block1
<dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
1 26 5171 357 foreign 1 EI 5171 357
2 26 3857 328 detailed 1 E 3857 328
3 26 3447 122 large 1 EI 3447 122
4 26 314 39 foreign 3 EI 5171 357
5 26 460 86 foreign 4 EI 5171 357
6 26 731 132 large 3 EI 3447 122
Edit: Adding a mutate if you want to set the values for block 1 to NA:
# Same join as before, plus a mutate() that blanks the _at_block1 values
# for the block-1 rows themselves. Explicit join keys avoid dplyr's
# join-key guessing.
d2 <- d1 %>%
  left_join(d1 %>% filter(block == 1) %>% select(RECORDING_SESSION_LABEL, item, RT1_at_block1 = RT1),
            by = c("RECORDING_SESSION_LABEL", "item")) %>%
  left_join(d1 %>% filter(block == 1) %>% select(RECORDING_SESSION_LABEL, item, RT2_at_block1 = RT2),
            by = c("RECORDING_SESSION_LABEL", "item")) %>%
  mutate(RT1_at_block1 = ifelse(block == 1, NA, RT1_at_block1),
         RT2_at_block1 = ifelse(block == 1, NA, RT2_at_block1))
I have a data of age and bmi (body mass index) of 66 individuals. Based on available knowledge bmi between 18.5 and 24.9 are normal. I wish to categorise as normal and abnormal bmi in R
You can use
# Indexing trick: the logical condition + 1 maps FALSE -> 1 ("abnormal")
# and TRUE -> 2 ("normal").
transform(df, type = c("abnormal", "normal")[(bmi >= 18.5 & bmi <= 24.9) + 1])
# age bmi type
#1 12 15.6 abnormal
#2 10 19.2 normal
#3 31 23.3 normal
#4 56 25.0 abnormal
#5 19 18.0 abnormal
Or with simple ifelse
# Equivalent, more explicit version using vectorized ifelse().
transform(df, type =ifelse(bmi >= 18.5 & bmi <= 24.9, "normal", "abnormal"))
We can also use between in dplyr/data.table
library(dplyr)
# between(bmi, 18.5, 24.9) is inclusive on both bounds, matching the
# question's definition of a normal BMI.
df %>% mutate(type = c("abnormal", "normal")[between(bmi, 18.5, 24.9) + 1])
data
df <- data.frame(age = c(12, 10, 31, 56, 19), bmi = c(15.6, 19.2, 23.3, 25, 18))
Let me dive right into a reproducible example here:
Here is the dataframe with these "possession" conditions to be met for each team:
structure(list(conferenceId = c("A10", "AAC", "ACC", "AE", "AS",
"BIG10", "BIG12", "BIGEAST", "BIGSKY", "BIGSOUTH", "BIGWEST",
"COLONIAL", "CUSA", "HORIZON", "IVY", "MAAC", "MAC", "MEAC",
"MVC", "MWC", "NE", "OVC", "PAC12", "PATRIOT", "SEC", "SOUTHERN",
"SOUTHLAND", "SUMMIT", "SUNBELT", "SWAC", "WAC", "WCC"), values = c(25.5,
33.625, 57.65, 16, 20.9, 48.55, 63.9, 45, 17.95, 28, 11, 24.4,
23.45, 10.5, 16, 12.275, 31.5, 10.95, 21.425, 36.8999999999999,
31.025, 18.1, 23.7, 19.675, 52.9999999999997, 24.5, 15, 27.5,
12.6, 17.75, 13, 33)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -32L))
> head(poss_quantiles)
# A tibble: 6 x 2
conferenceId values
<chr> <dbl>
1 A10 25.5
2 AAC 33.6
3 ACC 57.6
4 AE 16
5 AS 20.9
6 BIG10 48.5
My main dataframe looks as followed:
> head(stats_df)
# A tibble: 6 x 8
season teamId teamName teamMarket conferenceName conferenceId possessions games
<chr> <chr> <chr> <chr> <chr> <chr> <dbl> <int>
1 1819 AFA Falcons Air Force Mountain West MWC 75 2
2 1819 AKR Zips Akron Mid-American MAC 46 3
3 1819 ALA Crimson Tide Alabama Southeastern SEC 90.5 6
4 1819 ARK Razorbacks Arkansas Southeastern SEC 71.5 5
5 1819 ARK Razorbacks Arkansas Southeastern SEC 42.5 5
6 1819 ASU Sun Devils Arizona State Pacific 12 PAC12 91.5 7e: 6 x 8
> dim(stats_df)
[1] 6426 500
I need to filter the main dataframe stats_df so that each conference's possessions is greater than their respective possession value in the poss_quantiles dataframe. I am struggling to figure out the best way to do this with dplyr.
I believe the following is what the question asks for.
I have made up a dataset to test the code. Posted at the end.
library(dplyr)
# Join each conference's possession threshold onto the stats, keep rows
# whose possessions exceed it, then drop the helper column.
# The explicit `by` avoids dplyr's join-key guessing. The original
# trailing left_join(stats_df) was redundant — select(-values) already
# restores the stats_df columns — and re-joining on every column could
# fan out duplicated rows.
stats_df %>%
  inner_join(poss_quantiles, by = "conferenceId") %>%
  filter(possessions > values) %>%
  select(-values)
# conferenceId possessions otherCol oneMoreCol
#1 s 119.63695 -1.2519859 1.3853352
#2 d 82.68660 -0.4968500 0.1954866
#3 b 103.58936 -1.0149620 0.9405918
#4 o 139.69607 -0.1623095 0.4832004
#5 q 76.06736 0.5630558 0.1319336
#6 x 86.19777 -0.7733534 2.3939706
#7 p 135.80127 -1.1578085 0.2037951
#8 t 136.05944 1.7770844 0.5145781
Data creation code.
# Reproducible example data: random conference ids with thresholds
# (poss_quantiles) and random team stats (stats_df).
set.seed(1234)
poss_quantiles <- data.frame(conferenceId = letters[sample(26, 20)],
values = runif(20, 50, 100),
stringsAsFactors = FALSE)
stats_df <- data.frame(conferenceId = letters[sample(26, 20)],
possessions = runif(20, 10, 150),
otherCol = rnorm(20),
oneMoreCol = rexp(20),
stringsAsFactors = FALSE)
I have a dataframe PatientsA
Height Weight Age BMI
<dbl> <dbl> <dbl> <dbl>
1 161 72.2 27 27.9
2 164 61.0 21 22.8
3 171 72.0 30 24.6
4 169. 63.9 25 22.9
5 174. 64.4 27 21.1
6 160 50.9 22 19.9
7 172 77.5 22 26.3
8 165 54.5 22 20
9 173 82.4 29 27.5
10 169 76.6 22 26.9
and I would like to get some statistics for each column. I have the next working code which deals only with quantiles
# Quantiles per column: summarise_all() puts the three quantiles in a
# list-column, which is then unnested, transposed, named, and bound to
# the variable names.
# NOTE(review): funs() is deprecated since dplyr 0.8; a list of lambdas
# is preferred. Kept as posted since it reflects the question.
genStat <- PatientsA %>%
summarise_all(funs(list(quantile(., probs = c(0.25, 0.5, 0.75))))) %>%
unnest %>%
transpose %>%
setNames(., c('25%', '50%', '75%')) %>%
map_df(unlist) %>%
bind_cols(data.frame(vars = names(PatientsA)), .)
and I need to add mean and sd to summarise_all like this
# NOTE(review): this is the failing attempt from the question. The error
# appears to arise because setNames() applies 5 names while, after
# unnest()/transpose(), each element has only 3 components — the mean
# and sd results do not line up with the quantile list-column.
genStat <- PatientsA %>%
summarise_all(funs(mean,sd,list(quantile(., probs = c(0.25, 0.5, 0.75))))) %>%
unnest %>%
transpose %>%
setNames(., c('mean','sd','25%', '50%', '75%')) %>%
map_df(unlist) %>%
bind_cols(data.frame(vars = names(PatientsA)), .)
This straightforward approach fails returning the next error:
Error in names(object) <- nm : 'names' attribute [5] must be the
same length as the vector [3]
I'm a newbie in R, so what is the right syntax for completing this task?
This is what I would suggest. There is a little repetition in the code (calling quantile three times) but overall I think it is easier to understand and debug.
library(tidyverse)
# Reshape every column to long form, then compute the five summary
# statistics per variable. pivot_longer() supersedes the retired
# gather(); the names_to/values_to arguments reproduce its key/value
# column naming exactly.
PatientsA %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  group_by(variable) %>%
  summarize(mean_val = mean(value),
            sd_val = sd(value),
            q25 = quantile(value, probs = .25),
            q50 = quantile(value, probs = .5),
            q75 = quantile(value, probs = .75))
## A tibble: 4 x 6
# variable mean_val sd_val q25 q50 q75
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Age 24.7 3.33 22 23.5 27
#2 BMI 24.0 3.08 21.5 23.8 26.7
#3 Height 168. 5.01 164. 169 172.
#4 Weight 67.5 10.3 61.7 68.2 75.5
We could also place the quantile output in a list and then unnest
library(tidyverse)
# Place the three quantiles in a tibble list-column alongside mean/sd,
# then unnest so each quantile becomes its own column.
# NOTE(review): gather(), funs() and as.tibble() are all deprecated or
# superseded; the pivot_longer()/across() variant below is the modern
# equivalent.
PatientsA %>%
gather %>%
group_by(key) %>%
summarise_at(vars('value'),
funs(mean,
sd,
quantile = list(as.tibble(as.list(quantile(.,
probs = c(0.25, 0.5, 0.75))))))) %>%
unnest
# A tibble: 4 x 6
# key mean sd `25%` `50%` `75%`
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Age 24.7 3.33 22 23.5 27
#2 BMI 24.0 3.08 21.5 23.8 26.7
#3 Height 168. 5.01 164. 169 172.
#4 Weight 67.5 10.3 61.7 68.2 75.5
Or using pivot_longer
# Modern tidyr/dplyr form: pivot_longer() + across() with a named list
# of summary functions; the quantile tibble list-column is unnested at
# the end to yield one column per quantile.
PatientsA %>%
pivot_longer(cols = everything()) %>%
group_by(name) %>%
summarise(across(value, list(mean= ~ mean(., na.rm = TRUE),
sd = ~ sd(., na.rm = TRUE),
quantile = ~ list(as_tibble(as.list(quantile(.,
probs = c(0.25, 0.5, 0.75)))))))) %>%
unnest(c(value_quantile))
# A tibble: 4 x 6
name value_mean value_sd `25%` `50%` `75%`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Age 24.7 3.33 22 23.5 27
2 BMI 24.0 3.08 21.5 23.8 26.7
3 Height 168. 5.01 164. 169 172.
4 Weight 67.5 10.3 61.7 68.2 75.5
###data
PatientsA <- structure(list(Height = c(161, 164, 171, 169, 174, 160, 172,
165, 173, 169), Weight = c(72.2, 61, 72, 63.9, 64.4, 50.9, 77.5,
54.5, 82.4, 76.6), Age = c(27L, 21L, 30L, 25L, 27L, 22L, 22L,
22L, 29L, 22L), BMI = c(27.9, 22.8, 24.6, 22.9, 21.1, 19.9, 26.3,
20, 27.5, 26.9)), class = "data.frame", row.names = c("1", "2",
"3", "4", "5", "6", "7", "8", "9", "10"))