I have the following dataframe with 6 columns and several thousand rows.
Example:
Screenshot of example data
Each column represents a different timepoint: 0, 1, 3, 6, 9, 12. I want to calculate the area under the curve for each row of values.
For example, for row 1 I would use the following function from the DescTools package:
x=c(0,1,3,6,9,12)
y=c(130, 125, 120, 115, 108, 115)
AUC(x, y, method = c("linear"), na.rm=FALSE)
Is there a way to create a new variable which is the AUC for each row from my dataframe?
Thanks!
We can use apply with MARGIN = 1 to do this rowwise:
library(DescTools)
i1 <- rowSums(!is.na(df1)) > 2   # rows with more than two non-NA values
df1$AUC[i1] <- apply(df1[i1, ], 1, FUN = function(y)
    AUC(x, y, method = "linear", na.rm = FALSE))
df1$AUC[i1]
[1] 1394 1518
data
df1 <- structure(list(col1 = c(130, 140), col2 = c(125, 137), col3 = c(120,
125), col4 = c(115, 120), col5 = c(108, 125), col6 = c(115, 130
)), class = "data.frame", row.names = c(NA, -2L))
x <- c(0,1,3,6,9,12)
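If you prefer dplyr's rowwise syntax, something along these lines should also work (a sketch using the df1 and x objects above; the AUC call mirrors the one in the question):
library(dplyr)
library(DescTools)

df1 %>%
  rowwise() %>%
  mutate(AUC = if (sum(!is.na(c_across(col1:col6))) > 2) {
    # DescTools:: prefix avoids any clash with the new AUC column
    DescTools::AUC(x, c_across(col1:col6), method = "linear", na.rm = FALSE)
  } else {
    NA_real_
  }) %>%
  ungroup()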
Using a dataframe with missing values:
structure(list(id = c("id1", "test", "rew", "ewt"), total_frq_1 = c(54, 87, 10, 36), total_frq_2 = c(45, 24, 202, 43), total_frq_3 = c(24, NA, 25, 8), total_frq_4 = c(36, NA, 104, NA)), row.names = c(NA, 4L), class = "data.frame")
How is it possible to create a bar plot with the mean of every column, excluding the id column, without filling the missing values with 0 but instead leaving out the rows with missing values? For example, for total_frq_3: (24 + 25 + 8) / 3 = 19.
You can use the colMeans function and pass it na.rm = TRUE to ignore NAs.
library(ggplot2)
xy <- structure(list(id = c("id1", "test", "rew", "ewt"),
total_frq_1 = c(54, 87, 10, 36), total_frq_2 = c(45, 24, 202, 43), total_frq_3 = c(24, NA, 25, 8),
total_frq_4 = c(36, NA, 104, NA)),
row.names = c(NA, 4L),
class = "data.frame")
xy.means <- colMeans(x = xy[, 2:ncol(xy)], na.rm = TRUE)
xy.means <- as.data.frame(xy.means)
xy.means$total <- rownames(xy.means)
ggplot(xy.means, aes(x = total, y = xy.means)) +
theme_bw() +
geom_col()
Or just use base graphics:
barplot(height = colMeans(x = xy[, 2:ncol(xy)], na.rm = TRUE))
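If you would rather stay in the tidyverse for the reshaping step as well, the same column means can be computed with tidyr::pivot_longer (a sketch reusing the xy data frame from above):
library(tidyr)
library(dplyr)
library(ggplot2)

xy %>%
  pivot_longer(-id, names_to = "total", values_to = "frq") %>%
  group_by(total) %>%
  summarise(mean_frq = mean(frq, na.rm = TRUE)) %>%   # NA rows are dropped per column
  ggplot(aes(x = total, y = mean_frq)) +
  theme_bw() +
  geom_col()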
I am using the data found here: https://www.kaggle.com/cdc/behavioral-risk-factor-surveillance-system. In RStudio, I have named the csv file BRFSS2015. I have created two new columns comparing people who have arthritis vs. people who do not have arthritis (arth and no_arth). Grouping by these variables, I am now trying to find the mean and sd of their weights. The weight variable was generated from another variable in the dataset using this code: weight = BRFSS2015$WEIGHT2. Below is the code I am trying to run for mean and sd:
BRFSS2015%>%
group_by(arth,no_arth)%>%
summarize(mean_weight=mean(weight),
sd_weight=sd(weight))
I am getting output that says the mean and sd for these two groups are identical. I doubt this is correct. Can someone check and tell me why this is happening? The numbers I am getting are:
arth: mean = 733.2044; sd= 2197.377
no_arth: mean= 733.2044; sd= 2197.377
Here is how I created the variables arth and no_arth:
a=BRFSS2015%>%
select(HAVARTH3)%>%
filter(HAVARTH3=="1")
b=BRFSS2015%>%
select(HAVARTH3)%>%
filter(HAVARTH3=="2")
as.data.frame(BRFSS2015)
arth=c(a)
no_arth=c(b)
BRFSS2015$arth <- c(arth, rep(NA, nrow(BRFSS2015)-length(arth)))
BRFSS2015$no_arth <- c(no_arth, rep(NA, nrow(BRFSS2015)-length(no_arth)))
as.tibble(BRFSS2015)
Before I started, I also removed NAs from weight using weight=na.omit(WEIGHT2)
Based on the info you provided, one can only guess what went wrong in your analysis. But here is working code using a snippet of the real data:
library(tidyverse)
BRFSS2015_minimal <- structure(list(HAVARTH3 = c(
1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2,
1, 1, 1, 1, 1, 1, 2, 1, 2
), WEIGHT2 = c(
280, 165, 158, 180, 142,
145, 148, 179, 84, 161, 175, 150, 9999, 140, 170, 128, 200, 178,
155, 163
)), row.names = c(NA, -20L), class = c(
"tbl_df", "tbl",
"data.frame"
))
BRFSS2015_minimal %>%
filter(!is.na(WEIGHT2), HAVARTH3 %in% 1:2) %>%
mutate(arth = HAVARTH3 == 1, no_arth = HAVARTH3 == 2, weight = WEIGHT2) %>%
group_by(arth, no_arth) %>%
summarize(
mean_weight = mean(weight),
sd_weight = sd(weight),
.groups = "drop"
)
#> # A tibble: 2 × 4
#> arth no_arth mean_weight sd_weight
#> <lgl> <lgl> <dbl> <dbl>
#> 1 FALSE TRUE 165 10.8
#> 2 TRUE FALSE 865 2629.
Code used to create dataset
BRFSS2015 <- readr::read_csv("2015.csv")
BRFSS2015_minimal <- dput(head(BRFSS2015[c("HAVARTH3", "WEIGHT2")], 20))
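Since arth and no_arth are mutually exclusive by construction, you could also group on a single recoded variable. A leaner sketch of the same computation on BRFSS2015_minimal (the arthritis label column is just a hypothetical name):
BRFSS2015_minimal %>%
  filter(!is.na(WEIGHT2), HAVARTH3 %in% 1:2) %>%
  mutate(arthritis = if_else(HAVARTH3 == 1, "arth", "no_arth")) %>%
  group_by(arthritis) %>%
  summarize(mean_weight = mean(WEIGHT2),
            sd_weight   = sd(WEIGHT2),
            .groups     = "drop")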
I came up with the idea to represent stats on a chart like this (example of the plot), and made it like this:
df_n <- df_normalized %>%
transmute(
Height_x = round(Height*cos_my(45), 2),
Height_y = round(Height*sin_my(45), 2),
Weight_x = round(Weight*cos_my(45*2), 2),
Weight_y = round(Weight*sin_my(45*2), 2),
Reach_x = round(Reach*cos_my(45*3), 2),
Reach_y = round(Reach*sin_my(45*3), 2),
SLpM_x = round(SLpM*cos_my(45*4), 2),
SLpM_y = round(SLpM*sin_my(45*4), 2),
Str_Def_x = round(`Str_Def %`*cos_my(45*5), 2),
Str_Def_y = round(`Str_Def %`*sin_my(45*5), 2),
TD_Avg_x = round(TD_Avg*cos_my(45*6), 2),
TD_Avg_y = round(TD_Avg*sin_my(45*6), 2),
TD_Acc_x = round(`TD_Acc %`*cos_my(45*7), 2),
TD_Acc_y = round(`TD_Acc %`*sin_my(45*7), 2),
Sub_Avg_x = round(Sub_Avg*cos_my(45*8), 2),
Sub_Avg_y = round(Sub_Avg*sin_my(45*8), 2))
Now I want to do this in a smarter way, so I created a data frame with the same number of rows, empty_df, and in a for loop I try to mutate a new column on every iteration. So for example I want to multiply the 1st column by cos(45), the 2nd by cos(45*2), and so on.
But...
It mutates only the last column, because every column created during the iteration gets the same name, 'column'.
I want to name each column using the column variable made with paste0().
reprex_df <- structure(list(Height = c(190, 180, 183, 196, 185),
Weight = c(120, 77, 93, 120, 84),
Reach = c(193, 180, 188, 203, 193),
SLpM = c(2.45, 3.8, 2.05, 7.09, 3.17),
`Str_Def %` = c(58, 56, 55, 34, 44),
TD_Avg = c(1.23, 0.33, 0.64, 0.91, 0),
`TD_Acc %` = c(24, 50, 20, 66, 0),
Sub_Avg = c(0.2, 0, 0, 0, 0)), row.names = c(NA, -5L),
class = c("tbl_df", "tbl", "data.frame"))
temp <- apply(reprex_df[,1], function(x) x*cos(60), MARGIN = 2)
temp
empty_df <- data.frame(first_column = replicate(length(temp),1))
for (x in 1:8) {
  temp <- apply(reprex_df[, x], function(x) round(x*cos((360/8)*x), 2), MARGIN = 2)
  column <- paste0("Column_", x)
  empty_df <- mutate(empty_df, column = temp)
}
Later I want to make it a function where I can pass a data frame and receive a data frame with X and Y coordinates.
So, how should I do it?
Perhaps this helps
library(purrr)
library(stringr)
nm1 <- names(reprex_df)
nm_cos <- str_c(names(reprex_df), "_x")
nm_sin <- str_c(names(reprex_df), "_y")
reprex_df[nm_cos] <- map2(reprex_df, seq_along(nm1),
~ round(.x * cos(45 *.y ), 2))
reprex_df[nm_sin] <- map2(reprex_df[nm1], seq_along(nm1),
~ round(.x * sin(45 *.y ), 2))
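To get the function asked for at the end of the question, the same map2() logic can be wrapped up. A sketch where to_xy_coords() and its angle_step argument are hypothetical names; note that cos()/sin() in R work in radians, so the angles are converted here, which the answer above leaves to the cos_my()/sin_my() helpers:
library(purrr)
library(stringr)

# Hypothetical wrapper: returns the input data frame with *_x and *_y
# coordinate columns appended, one angle step per original column.
to_xy_coords <- function(df, angle_step = 45) {
  nm  <- names(df)
  ang <- angle_step * seq_along(nm) * pi / 180   # degrees -> radians
  df[str_c(nm, "_x")] <- map2(df[nm], ang, ~ round(.x * cos(.y), 2))
  df[str_c(nm, "_y")] <- map2(df[nm], ang, ~ round(.x * sin(.y), 2))
  df
}

df_coords <- to_xy_coords(reprex_df[nm1])   # call on the original eight columns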
Does anyone know if it is possible to calculate a weighted mean in R when values are missing, such that the weights for the existing values are scaled upward proportionately?
To convey this clearly, I created a hypothetical scenario. This describes the root of the question, where the scalar needs to be adjusted for each row, depending on which values are missing.
Image: Weighted Mean Calculation
File: Weighted Mean Calculation in Excel
Using weighted.mean from the base stats package with the argument na.rm = TRUE should get you the result you need. Here is a tidyverse way this could be done:
library(tidyverse)
scores <- tribble(
~student, ~test1, ~test2, ~test3,
"Mark", 90, 91, 92,
"Mike", NA, 79, 98,
"Nick", 81, NA, 83)
weights <- tribble(
~test, ~weight,
"test1", 0.2,
"test2", 0.4,
"test3", 0.4)
scores %>%
gather(test, score, -student) %>%
left_join(weights, by = "test") %>%
group_by(student) %>%
summarise(result = weighted.mean(score, weight, na.rm = TRUE))
#> # A tibble: 3 x 2
#> student result
#> <chr> <dbl>
#> 1 Mark 91.20000
#> 2 Mike 88.50000
#> 3 Nick 82.33333
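gather() has since been superseded; with tidyr >= 1.0 the same reshape can be written with pivot_longer() (a sketch reusing the scores and weights tibbles above):
scores %>%
  pivot_longer(-student, names_to = "test", values_to = "score") %>%
  left_join(weights, by = "test") %>%
  group_by(student) %>%
  summarise(result = weighted.mean(score, weight, na.rm = TRUE))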
The best way to post an example dataset is to use dput(head(dat, 20)), where dat is the name of a dataset. Graphic images are a really bad choice for that.
DATA.
dat <-
structure(list(Test1 = c(90, NA, 81), Test2 = c(91, 79, NA),
Test3 = c(92, 98, 83)), .Names = c("Test1", "Test2", "Test3"
), row.names = c("Mark", "Mike", "Nick"), class = "data.frame")
w <-
structure(list(Test1 = c(18, NA, 27), Test2 = c(36.4, 39.5, NA
), Test3 = c(36.8, 49, 55.3)), .Names = c("Test1", "Test2", "Test3"
), row.names = c("Mark", "Mike", "Nick"), class = "data.frame")
CODE.
You can use the function weighted.mean in the base package stats and sapply for this. Note that if your datasets of scores and weights are R objects of class matrix you will not need unlist.
sapply(seq_len(nrow(dat)), function(i){
weighted.mean(unlist(dat[i,]), unlist(w[i, ]), na.rm = TRUE)
})
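If you want to check that na.rm = TRUE really rescales the remaining weights proportionally (as the Excel sheet in the question does), here is a quick manual verification for one row of the dat and w objects above:
i  <- 2                                    # Mike: Test1 is NA
ok <- !is.na(unlist(dat[i, ]))             # tests that were actually taken
w_scaled <- unlist(w[i, ok]) / sum(unlist(w[i, ok]))   # remaining weights rescaled to sum to 1

sum(unlist(dat[i, ok]) * w_scaled)
# same value as weighted.mean(unlist(dat[i, ]), unlist(w[i, ]), na.rm = TRUE)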
I have a data.frame P1 (5000 rows x 4 cols) and would like to save the subset of data in columns 2, 3 and 4 when the time-stamp in column 1 falls into a set range determined by a vector TimeStamp (in seconds).
E.g. put all values in columns 2, 3, and 4 into a new data.frame and call each section of data Condition.1.P1, Condition.2.P1, etc.
The reason I'd like to label them separately is that I have 35 versions of P1 (P2, P3, P33, etc.) and need to be able to melt them together to plot them.
dput(TimeStamp)
c(18, 138, 438, 678, 798, 1278, 1578, 1878, 2178)
dput(head(P1))
structure(list(Time = c(0, 5, 100, 200, 500, 1200), SkinTemp = c(27.781,
27.78, 27.779, 27.779, 27.778, 27.777), HeartRate = c(70, 70,
70, 70, 70, 70), RespirationRate = c(10, 10, 10, 10, 10, 10)), .Names = c("Time",
"SkinTemp", "HeartRate", "RespirationRate"), row.names = c(NA,
6L), class = "data.frame")
Do you want to separate the data by the timestamp range and put it in a list? Then this might be what you are looking for:
TimeStamp <- c(18, 138, 438, 678, 798, 1278, 1578, 1878, 2178)
dat <- structure(list(Time = c(0, 5, 100, 200, 500, 1200), SkinTemp = c(27.781,
27.78, 27.779, 27.779, 27.778, 27.777), HeartRate = c(70, 70,
70, 70, 70, 70), RespirationRate = c(10, 10, 10, 10, 10, 10)), .Names = c("Time",
"SkinTemp", "HeartRate", "RespirationRate"), row.names = c(NA,
6L), class = "data.frame")
dat$Segment <- cut(dat$Time,c(-Inf,TimeStamp))
split(dat,dat$Segment)
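To get the labelled pieces the question describes (Condition.1.P1, Condition.2.P1, ...) and keep only columns 2, 3 and 4, the split result can be post-processed; a sketch reusing dat and TimeStamp from above, with the Condition.*.P1 names taken from the question:
pieces <- split(dat[c("SkinTemp", "HeartRate", "RespirationRate")], dat$Segment)
names(pieces) <- paste0("Condition.", seq_along(pieces), ".P1")

pieces$Condition.1.P1   # rows whose Time falls in (-Inf, 18]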
ts = TimeStamp # Vector of segment boundaries (in seconds)
P2 = data.frame(NA, NA, NA, NA) # Create empty data.frame
for (i in 1:length(ts)){
  P3 = data.frame() # Create empty changing data.frame
  if (i == 1) {ts1 = 0} else {ts1 = ts[i-1]} # First time stamp starts at 0
  ts2 = ts[i]
  P3 = subset(P1, P1$Time > ts1 & P1$Time < ts2)[,c(2,3,4)] # Subset the columns and assign to P3
  if (nrow(P3) == 0){P3 = data.frame(NA, NA, NA)} # If the subset is empty, assign NA
  P3$TimeStamp = paste(ts1,ts2,sep="-") # Append TimeStamp to P3
  colnames(P3) = colnames(P2) # Make sure column names are the same to allow rbind
  P2 = rbind(P2,P3) # Append P3 to P2
}
P2 = P2[c(2:nrow(P2)),] # Remove the first row (that has NA)
colnames(P2) = c("SkinTemp", "HeartRate", "RespirationRate", "TimeStamp") # Provide column names
rm(P3); rm(i); rm(ts1); rm(ts2) # Cleanup
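And if the same segmentation has to be applied to all 35 data frames (P1, P2, ..., P33) before melting, one option is to keep them in a named list and label each piece; a sketch building on the cut() approach above, where the participants list is a hypothetical container for your objects:
library(reshape2)

# Hypothetical: collect the participant data frames in a named list
participants <- list(P1 = P1)          # extend with P2 = P2, P3 = P3, ...

segmented <- lapply(names(participants), function(nm) {
  d <- participants[[nm]]
  d$Segment     <- cut(d$Time, c(-Inf, TimeStamp))   # same segmentation as above
  d$Participant <- nm
  d
})

# One long data frame, ready for plotting
long <- melt(do.call(rbind, segmented),
             id.vars = c("Time", "Segment", "Participant"))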