Related
I need to apply the above to each element with the help of the map() function.
How can I do so?
As dt is of class data.table, you can make a vector of columns of interest (i.e. your items; below I use grepl on the names), and then apply your weighting function to each of those columns using .SD and .SDcols, with by
# Item columns are the ones whose names begin with "q"
qs <- grep("^q", names(dt), value = TRUE)
# For every sex / education_code / age group, add one "<item>wt" column per
# item: the reciprocal of the share of non-missing answers in that group
dt[, (paste0(qs, "wt")) := lapply(.SD, function(q) 1 / (sum(!is.na(q)) / .N)),
   by = .(sex, education_code, age), .SDcols = qs]
As mentioned in the comments, you miss a dt <- in your dt[, .(ID, education_code, age, sex, item = q1_1)] which makes the column item unavailable in the following line dt[, no_respond := is.na(item)].
Your weighting scheme is not entirely clear to me. However, assuming you want to do what your code does here, I would go with a dplyr solution to iterate over the columns.
# Example data: the question's table without the no_respond column and with
# the missing value in q2_3 corrected
dt <- data.table::data.table(
  ID             = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  education_code = c(20, 50, 20, 60, 20, 10, 5, 12, 12, 12),
  age            = c(87, 67, 56, 52, 34, 56, 67, 78, 23, 34),
  sex            = c("F", "M", "M", "M", "F", "M", "M", "M", "M", "M"),
  q1_1           = c(NA, 1, 5, 3, 1, NA, 3, 4, 5, 1),
  q1_2           = c(NA, 1, 5, 3, 1, 2, NA, 4, 5, 1),
  q1_3           = c(NA, 1, 5, 3, 1, 2, 3, 4, 5, 1),
  q1_text        = c(NA, 1, 5, 3, 1, 2, 3, 4, 5, 1),
  q2_1           = c(NA, 1, 5, 3, 1, 2, 3, 4, 5, 1),
  q2_2           = c(NA, 1, 5, 3, 1, 2, 3, 4, 5, 1),
  q2_3           = c(NA, 1, 5, 3, 1, NA, NA, 4, 5, 1),
  q2_text        = c(NA, 1, 5, 3, 1, NA, 3, 4, 5, 1)
)
dt %>%
  # one group per sex / education_code / age combination
  group_by(sex, education_code, age) %>%
  # n = number of rows in the current group
  mutate(n = n()) %>%
  # for every column starting with "q", add "<column>_wgt" holding
  # 1 / (share of non-missing values in the group)
  mutate(
    across(
      starts_with("q"),
      ~ 1 / (sum(!is.na(.x)) / n),
      .names = "{.col}_wgt"
    )
  ) %>%
  ungroup()
I want to calculate the mean of a column and also concatenate the texts of a second column in the output.
For example, below I want to calculate the mean of C1 and then concatenate all texts in C1T in the next column if there is more than one text in C1T.
# Example survey data: each numeric item C1..C5 is paired with a free-text
# comment column C1T..C5T. The comment column that follows C4 is C4T; naming
# it C5T a second time would make data.frame() rename the duplicate to C5T.1
# and leave no C4T column for later code to refer to.
df <- data.frame(
  A1 = c("class", "type", "class", "type", "class", "class", "class", "class", "class"),
  B1 = c("b2", "b3", "b3", "b1", "b3", "b3", "b3", "b2", "b1"),
  C1 = c(6, NA, 1, 6, NA, 1, 6, 6, 2),
  C1T = c(NA, "Part of other business", NA, NA, NA, NA, NA, NA, NA),
  C2 = c(NA, 4, 1, 2, 4, 4, 3, 3, NA),
  C2T = c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
  C3 = c(3, 4, 3, 3, 6, NA, 2, 4, 1),
  C3T = c(NA, NA, NA, NA, "two part are available but not in source", NA, NA, NA, NA),
  C4 = c(5, 5, 2, NA, NA, 6, 4, 1, 2),
  C4T = c(NA, NA, NA, NA, NA, NA, NA, "Critical Expert", NA),
  C5 = c(6, 2, 6, 4, 2, 2, 5, 4, 1),
  C5T = c(NA, NA, NA, NA, NA, "most of things are stuck", "weather responsible", NA, NA)
)
# Asker's attempt: mean of one numeric column plus the pasted text of its
# comment column, per B1 group, for rows where A1 == "class".
var <- "C1"
var1 <- "C1T"
# Convert the column-name strings into expressions so they can be unquoted
# with !! inside dplyr verbs
var <- rlang::parse_expr(var)
var1 <- rlang::parse_expr(var1)
df1 <- df%>%filter(A1 == "class")
# Group mean of the numeric column, rounded to one decimal
T1<- df1 %>%group_by(B1)%>%summarise(mean=round(mean(!!var,na.rm = TRUE),1))
# Paste the comment column per group, then strip the literal "NA" text and
# surrounding whitespace.
# NOTE(review): var1 is passed to vars()/select() without !! -- confirm that it
# actually resolves to the C1T column.
Comments <- df1 %>% group_by(B1) %>% summarise_at(vars(var1), paste0, collapse = " ") %>%
select(var1) %>% unlist() %>% gsub("NA","",.) %>% stringi::stri_trim_both()
# Put means and comments side by side (relies on both having the same group order)
cbind(T1,Comments)
Edited Answer:
library(dplyr)

# Everything that varies is supplied as a string ...
var <- "C1"           # numeric column to average
var1 <- "C1T"         # free-text column to concatenate
filtercol <- "A1"     # column to filter on
filterval <- "class"  # value to keep in `filtercol`
groupingvar <- "B1"   # grouping column

# ... and the column names are turned into expressions that can be unquoted
# with !! inside dplyr verbs.
var <- rlang::parse_expr(var)
var1 <- rlang::parse_expr(var1)
filtercol <- rlang::parse_expr(filtercol)
groupingvar <- rlang::parse_expr(groupingvar)

df1 <- df %>% filter(!!filtercol == filterval)

# Group mean of the numeric column, rounded to one decimal
T1 <- df1 %>%
  group_by(!!groupingvar) %>%
  summarise(mean = round(mean(as.numeric(!!var), na.rm = TRUE), 1))

# Concatenate the comments of each group. Missing values are dropped *before*
# pasting: removing the text "NA" afterwards would also delete "NA" inside
# genuine comments and leave stray spaces between the remaining texts.
# Groups without any comment yield "".
Comments <- df1 %>%
  group_by(!!groupingvar) %>%
  summarise(text = paste(na.omit(!!var1), collapse = " ")) %>%
  pull(text)

# Both summaries are grouped by the same column, so their rows line up
T1 <- cbind(T1, Comments)
Update on OP's request (see comments):
library(dplyr)
# Helper to coalesce the values of one column.
#
# `df` is the vector of values of a single column (despite its name, it is
# called with a column vector, not a data frame). Returns the first
# non-missing element, or NA when every element is missing. Looking at the
# whole vector instead of only the first two elements keeps the result
# correct wherever the non-missing value happens to sit.
coalesce_by_column <- function(df) {
  non_missing <- df[!is.na(df)]
  # Indexing an empty vector with [1] yields NA of the matching type
  non_missing[1]
}
# Stack the text columns into name/value pairs, keep the C1T rows and
# summarise them: overall mean of the numeric columns in C1:C5 plus the
# coalesced comment. pivot_longer() comes from tidyr, which the preceding
# library(dplyr) call does not attach, so it is called with its namespace.
df %>%
  tidyr::pivot_longer(
    cols = contains("T"),
    names_to = "names",
    values_to = "values"
  ) %>%
  filter(names == "C1T") %>%
  group_by(names) %>%
  summarise(
    Mean = mean(c_across(C1:C5 & where(is.numeric)), na.rm = TRUE),
    Comments = coalesce_by_column(values)
  )
Output:
names Mean Comments
<chr> <dbl> <chr>
1 C1T 3.47 Part of other business
First answer
coalesce to construct Comments column
rowwise with c_across to calculate the mean rowwise.
In case you need to group, you can use `group_by()`.
library(dplyr)
# Build one Comments column and one row-wise Mean column, then keep only the
# identifying columns plus those two.
df %>%
# Comments = first non-missing text among C1T..C5T for each row;
# .keep = "unused" drops the text columns that were consumed here
mutate(Comments = coalesce(C1T, C2T, C3T, C4T, C5T),.keep="unused") %>%
# switch to row-by-row evaluation so mean() sees one row at a time
rowwise() %>%
# mean over the numeric columns in the C1:C5 range, ignoring NAs
mutate(Mean = mean(c_across(C1:C5 & where(is.numeric)), na.rm = TRUE)) %>%
select(A1, B1, Mean, Comments)
Output:
A1 B1 Mean Comments
<chr> <chr> <dbl> <chr>
1 class b2 5 NA
2 type b3 3.75 Part of other business
3 class b3 2.6 NA
4 type b1 3.75 NA
5 class b3 4 two part are available but not in source
6 class b3 3.25 most of things are stuck
7 class b3 4 weather responsible
8 class b2 3.6 Critical Expert
9 class b1 1.5 NA
I have a data set similar to the following format:
Each user has only one variable that is not NA. I want to return the column name of this non-NA column as follows:
Writing a loop by row may easily solve this problem, but I want to use data.table to generate this variable.
With base R, it would be more efficient:
# Value columns are everything except the leading `user` column
value_cols <- names(df1)[-1]
# Position of the first non-NA entry in each row (ties resolved to the first)
first_non_na <- max.col(!is.na(df1[value_cols]), ties.method = "first")
df1$NonNA_VarName <- value_cols[first_non_na]
df1$NonNA_VarName
#[1] "v1" "v2" "v1" "v3" "v4" "v3"
With data.table, an option is to melt into 'long' format and then extract the first 'variable' for each user:
library(data.table)
# Convert df1 to a data.table by reference
setDT(df1)
# Long format: one row per user and non-missing variable
long <- melt(df1, id.vars = "user", na.rm = TRUE)
# Name of the first remaining variable for each user
first_var <- long[, .(NonNA_VarName = first(variable)), by = user]
# Join the result back onto the wide table
first_var[df1, on = .(user)]
Or another option is to group by 'user' and use which.max to return the index
# For each user, record the name of the first column holding a non-NA value.
# Testing is.na() directly (instead of taking which.max() of the values) keeps
# this consistent with the other approaches and yields NA rather than a
# zero-length result for a user whose values are all missing.
setDT(df1)[, NonNA_VarName := names(.SD)[which(!is.na(unlist(.SD)))[1L]], by = user]
data
# Example data: six users, each with exactly one non-missing value in v1..v4
df1 <- data.frame(
  user = 1:6,
  v1 = c(3, NA, 2, NA, NA, NA),
  v2 = c(NA, 5, NA, NA, NA, NA),
  v3 = c(NA, NA, NA, 5, NA, 7),
  v4 = c(NA, NA, NA, NA, 4, NA)
)
I have 50+ csv files in a folder on my computer that I would like merged into 1 giant data table. Below is an example of how 3 out of my 50 tables could look (one, two, and three) and how I would like my final table to look (together).
# Three example input tables with partly overlapping columns
one <- data.frame(
  County = c("Autauga", "Barbour", "Bibb"),
  AAAA = c(1, 1, 1),
  BBBB = c(2, 2, 2)
)
two <- data.frame(
  County = c("Cape May", "Mercer", "Bergen"),
  BBBB = c(1, 1, 1),
  CCCC = c(2, 2, 2),
  DDDD = c(1, 2, 3)
)
three <- data.frame(
  County = c("Lincoln", "Jackson", "Pike"),
  CCCC = c(1, 1, 1)
)
# Desired result: all rows stacked, columns matched by name, gaps filled
# with NA. "Cape May" is written on a single line; splitting the literal
# across two lines would embed a newline in the county name.
together <- data.frame(
  County = c(
    "Autauga", "Barbour", "Bibb",
    "Cape May", "Mercer", "Bergen",
    "Lincoln", "Jackson", "Pike"
  ),
  AAAA = c(1, 1, 1, NA, NA, NA, NA, NA, NA),
  BBBB = c(2, 2, 2, 1, 1, 1, NA, NA, NA),
  CCCC = c(NA, NA, NA, 2, 2, 2, 1, 1, 1),
  DDDD = c(NA, NA, NA, 1, 2, 3, NA, NA, NA)
)
If anyone could help me with this, that would be great! Also the blanks do not need to be "NA", they can just be left as blanks.
We can use bind_rows
library(tidyverse)
# Stack the tables; columns are matched by name and gaps become NA
bind_rows(list(one, two, three))
If there are many datasets, place them in a list and then use bind_rows, or rbindlist from data.table.
Instead of creating multiple data.table/data.frame objects in the global env, read it into a list and then use rbindlist
library(data.table)
# `files` is assumed to be a character vector of CSV paths, e.g. from
# list.files(<folder>, pattern = "\\.csv$", full.names = TRUE) -- TODO confirm.
# The files do not share the same columns, so fill = TRUE is needed: it
# matches columns by name and pads the missing ones with NA instead of failing.
rbindlist(lapply(files, fread), fill = TRUE)
I would like to calculate the "non-NA values interval" for different columns.
Here is the dataset:
# Example data: twelve monthly observations; X1 and X3 contain gaps, X2 is
# fully observed (random draws between 50 and 100) and X4 is entirely missing
temp <- data.frame(
  date = seq(as.Date("2018-01-01"), by = "month", length.out = 12),
  X1   = c(100, NA, 23, NA, NA, 12, NA, NA, NA, NA, NA, 100),
  X2   = runif(n = 12, min = 50, max = 100),
  X3   = c(24, NA, NA, NA, NA, 31, 1, NA, 44, NA, 100, NA),
  X4   = NA
)
For example, X1 has non-NA intervals of 1, 2 and 5: from 100 to 23 there is 1 NA between the two non-NA values, from 23 to 12 there are 2 NAs, and from 12 to 100 there are 5 NAs.
The expected result is:
# Expected output, built from the NA-gap lengths read off each column
x1_gaps <- c(1, 2, 5)
x3_gaps <- c(4, 1, 1, 1)
result <- data.frame(
  X1_inv_mean   = mean(x1_gaps),
  X1_inv_median = median(x1_gaps),
  X1_inv_sd     = sd(x1_gaps),
  # X2 has no missing values, so its only interval is 0
  X2_inv_mean   = mean(0),
  X2_inv_median = median(0),
  X2_inv_sd     = sd(0),
  X3_inv_mean   = mean(x3_gaps),
  X3_inv_median = median(x3_gaps),
  X3_inv_sd     = sd(x3_gaps),
  # X4 is entirely missing
  X4_inv_mean   = NA,
  X4_inv_median = NA,
  X4_inv_sd     = NA
)
>result
X1_inv_mean X1_inv_median X1_inv_sd X2_inv_mean X2_inv_median X2_inv_sd X3_inv_mean X3_inv_median X3_inv_sd
1 2.666667 2 2.081666 0 0 NA 1.75 1 1.5
X4_inv_mean X4_inv_median X4_inv_sd
1 NA NA NA
Thanks for the help!
A base R option
# For every column except the first (date), summarise the lengths of the
# runs of consecutive NAs with mean, median and sd.
out <- lapply(temp[-1], function(x) {
  # Entirely missing column: no interval can be measured, so return NA for
  # all three statistics (same shape as the regular result)
  if (all(is.na(x))) {
    return(c(mean = NA_real_, median = NA_real_, sd = NA_real_))
  }
  # Lengths of the runs where is.na(x) is TRUE
  tmp <- with(rle(is.na(x)), lengths[values])
  # A column without any NA has a single interval of length 0, as in the
  # expected result, rather than NaN from averaging an empty vector
  if (length(tmp) == 0) {
    tmp <- 0
  }
  c(mean = mean(tmp),
    median = median(tmp),
    sd = sd(tmp))
})
as.data.frame(out)
#              X1 X2   X3 X4
#mean    2.666667  0 1.75 NA
#median  2.000000  0 1.00 NA
#sd      2.081666 NA 1.50 NA
Using rle the following line gives you the runs of NAs for each column
tmp <- with(rle(is.na(x)), lengths[values])
E.g. for column X1
with(rle(is.na(temp$X1)), lengths[values])
#[1] 1 2 5
Then we calculate your summary statistics for each tmp.
If all values in a column are NA the function returns NA.
Update:
For variable n columns:
# Summarise every column except `date` with mean, median and sd (NAs
# dropped). across() applies the three functions to each column and builds
# the "<column>_inv_<statistic>" names itself, so no command has to be
# assembled as text and run through eval(parse()).
output <- temp %>%
  summarise(
    across(
      -date,
      list(
        inv_mean = ~ mean(.x, na.rm = TRUE),
        inv_median = ~ median(.x, na.rm = TRUE),
        inv_sd = ~ sd(.x, na.rm = TRUE)
      ),
      .names = "{.col}_{.fn}"
    )
  )
Using dplyr:
library(dplyr)
# Mean, median and sd of each column with NAs dropped. Every *_inv_mean
# uses mean() and every *_inv_median uses median(); TRUE is spelled out
# because T can be reassigned.
output <- temp %>%
  summarise(
    x1_inv_mean = mean(X1, na.rm = TRUE),
    x1_inv_median = median(X1, na.rm = TRUE),
    x1_inv_sd = sd(X1, na.rm = TRUE),
    x2_inv_mean = mean(X2, na.rm = TRUE),
    x2_inv_median = median(X2, na.rm = TRUE),
    x2_inv_sd = sd(X2, na.rm = TRUE),
    x3_inv_mean = mean(X3, na.rm = TRUE),
    x3_inv_median = median(X3, na.rm = TRUE),
    x3_inv_sd = sd(X3, na.rm = TRUE),
    x4_inv_mean = mean(X4, na.rm = TRUE),
    x4_inv_median = median(X4, na.rm = TRUE),
    x4_inv_sd = sd(X4, na.rm = TRUE)
  )