calculate the mean of column and also the comments in next column - r

I want to calculate the mean of column and and also concatenate the texts in second column output.
for example in below i want to calculate the mean of C1 and then concatenate all texts in C1T in next column if there is more than one text in C1T.
df <- data.frame(A1 = c("class","type","class","type","class","class","class","class","class"),
B1 = c("b2","b3","b3","b1","b3","b3","b3","b2","b1"),
C1=c(6, NA, 1, 6, NA, 1, 6, 6, 2),
C1T=c(NA, "Part of other business", NA, NA, NA, NA, NA, NA, NA),
C2=c(NA, 4, 1, 2, 4, 4, 3, 3, NA),
C2T=c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
C3=c(3, 4, 3, 3, 6, NA, 2, 4, 1),
C3T=c(NA, NA, NA, NA, "two part are available but not in source", NA, NA, NA, NA),
C4=c(5, 5, 2, NA, NA, 6, 4, 1, 2),
C5T=c(NA, NA, NA, NA, NA, NA, NA, "Critical Expert", NA),
C5=c(6, 2, 6, 4, 2, 2, 5, 4, 1),
C5T=c(NA, NA, NA, NA, NA, "most of things are stuck", "weather responsible", NA, NA))
var <- "C1"
var1 <- "C1T"
var <- rlang::parse_expr(var)
var1 <- rlang::parse_expr(var1)
df1 <- df%>%filter(A1 == "class")
T1<- df1 %>%group_by(B1)%>%summarise(mean=round(mean(!!var,na.rm = TRUE),1))
Comments <- df1 %>% group_by(B1) %>% summarise_at(vars(var1), paste0, collapse = " ") %>%
select(var1) %>% unlist() %>% gsub("NA","",.) %>% stringi::stri_trim_both()
cbind(T1,Comments)

Edited Answer:
var <- "C1"
var1 <- "C1T"
filtercol <- "A1"
filterval <- "class"
groupingvar <- "B1"
var <- rlang::parse_expr(var)
var1 <- rlang::parse_expr(var1)
filtercol <- rlang::parse_expr(filtercol)
groupingvar <- rlang::parse_expr(groupingvar)
library(dplyr)
df1 <- df %>% filter(!!filtercol == filterval)
T1 <- df1 %>% group_by(!!groupingvar) %>% summarise(mean=round(mean(as.numeric(!!var),na.rm = TRUE),1))
Comments <- df1 %>% select(!!groupingvar, !!var1) %>%
group_by(!!groupingvar) %>%
summarise_at(vars(!!var1), paste0, collapse = " ") %>%
select(!!var1) %>% unlist() %>% gsub("NA", "", .) %>%
stringi::stri_trim_both()
T1 <- cbind(T1,Comments)

Update on OP's request (see comments):
library(dplyr)
# helper function to coalesce by column
coalesce_by_column <- function(df) {
return(coalesce(df[1], df[2]))
}
df %>%
pivot_longer(
cols = contains("T"),
names_to = "names",
values_to = "values"
) %>%
filter(names == "C1T") %>%
group_by(names) %>%
summarise(Mean = mean(c_across(C1:C5 & where(is.numeric)), na.rm = TRUE),
Comments = coalesce_by_column(values))
Output:
names Mean Comments
<chr> <dbl> <chr>
1 C1T 3.47 Part of other business
First answer
coalesce to construct Comments column
rowwise with c_across to calculate the mean rowwise.
In case you need to group, you can use ``group_by`
library(dplyr)
df %>%
mutate(Comments = coalesce(C1T, C2T, C3T, C4T, C5T),.keep="unused") %>%
rowwise() %>%
mutate(Mean = mean(c_across(C1:C5 & where(is.numeric)), na.rm = TRUE)) %>%
select(A1, B1, Mean, Comments)
Output:
A1 B1 Mean Comments
<chr> <chr> <dbl> <chr>
1 class b2 5 NA
2 type b3 3.75 Part of other business
3 class b3 2.6 NA
4 type b1 3.75 NA
5 class b3 4 two part are available but not in source
6 class b3 3.25 most of things are stuck
7 class b3 4 weather responsible
8 class b2 3.6 Critical Expert
9 class b1 1.5 NA

Related

apply custom-made function to column pairs and create summary table

I have data with ratings on many parameters by two different raters; here are shown just a snippet of ratings on three same-prefix parameters (e.g. DH and DH_ptak):
df <- structure(list(DH = c(0, 1, NA, NA, 1, 1, 1, 1, 1, 1),
DH_ptak = c(0, 1, 1, 1, 1, 1, 1, 1, 1, 1),
SZ = c(1, 1, NA, NA, NA, 0, 1, 0, 1, 1),
SZ_ptak = c(1, 1, NA, NA, NA, 1, 0, NA, 1, 1),
RM = c(0, 1, 1, NA, NA, NA, 0, NA, 1, NA),
RM_ptak = c(0, 1, 1, 1, 1, NA, 0, 1, NA, 1)),
row.names = c(NA, 10L), class = "data.frame")
For each parameter I want to compare the two ratings columns. I use this function to find different ratings:
compare_fun <- function(c1, c2){
case_when(is.na(c1) & is.na(c2) ~ 0,
is.na(c1) | is.na(c2) ~ 1,
c1 != c2 ~ 1,
TRUE ~ 0)
}
I can use this function to sum the differences and compute an agreement percentage agree_pct:
library(dplyr)
df %>%
mutate(diff = compare_fun(DH, DH_ptak)) %>%
summarise(sum = sum(diff),
agree_pct = (nrow(df)-sum)/nrow(df)*100)
sum agree_pct
1 2 80
The problem is that I have multiple parameters. How can I compute for all ratings-column pairs the respective sum and agree_pct in one go, ideally, to obtain a table like this:
sum agree_pct
DH 2 80
SZ 3 70
RM 5 50
This is what I would do. It mostly involves pivoting the data a few times. First I make a column from row names so that I can use this to keep all the rows straight, then I go from wide to long with pivot_longer. I separate the column names to delineate between the two reviewers and assign them the names "grp1" and "grp2". Then I pivot_wider so that you have 2 columns, one for each reviewer. Lastly I apply your function across all the data, group by the variable of interest and summarize the data.
library(tidyverse)
df %>%
rownames_to_column("col") %>%
pivot_longer( -col) %>%
separate(name, into = c("var", "tmp"), sep = "_") %>%
mutate(grp = ifelse(is.na(tmp), "grp1", "grp2")) %>%
select(col, var, value, grp) %>%
pivot_wider(names_from = grp, values_from = value) %>%
mutate(diff = compare_fun(grp1, grp2)) %>%
group_by(var) %>%
summarise(sum = sum(diff),
agree_pct = (nrow(df)-sum)/nrow(df)*100)
#> # A tibble: 3 x 3
#> var sum agree_pct
#> <chr> <dbl> <dbl>
#> 1 DH 2 80
#> 2 RM 5 50
#> 3 SZ 3 70

Summary of N recent values

I am trying to get summary statistics (sum and max here) with most N recent values.
Starting data:
dt = data.table(id = c('a','a','a','a','b','b','b','b'),
week = c(1,2,3,4,1,2,3,4),
value = c(2, 3, 1, 0, 5, 7,3,2))
Desired result:
dt = data.table(id = c('a','a','a','a','b','b','b','b'),
week = c(1,2,3,4,1,2,3,4),
value = c(2, 3, 1, 0, 5, 7,3,2),
sum_recent2week = c(NA, NA, 5, 4, NA, NA, 12, 10),
max_recent2week = c(NA, NA, 3, 3, NA, NA, 7, 7))
With the data, I would like to have sum and max of 2 (N=2) most recent values for each row by id. 4th(sum_recent2week) and 5th (max_recent2week) columns are my desired columns
You can use rollsum and rollmax from the zoo package.
dt[, `:=`(sum_recent2week =
shift(rollsum(value, 2, align = 'left', fill = NA), 2),
max_recent2week =
shift(rollmax(value, 2, align = 'left', fill = NA), 2))
, id]
For the sum, if you're using data table version >= 1.12, you can use data.table::frollmean. The default for frollmean is fill = NA, so no need to specify that in this case.
dt[, `:=`(sum_recent2week =
shift(frollmean(value, 2, align = 'left')*2, 2),
max_recent2week =
shift(rollmax(value, 2, align = 'left', fill = NA), 2))
, id]
I'm sure it can be done in a much more elegant way, but here is one tidyverse possibility:
dt %>%
group_by(id) %>%
mutate(sum_recent2week = lag(value + lead(value), n = 2),
max_recent2week = pmax(lag(value, n = 2), lag(value, n = 1))) %>%
rowid_to_column() %>%
select(-week, -value) %>%
top_n(-2) %>%
right_join(dt %>%
rowid_to_column(), by = c("rowid" = "rowid",
"id" = "id")) %>%
select(-rowid)
id sum_recent2week max_recent2week week value
<chr> <dbl> <dbl> <dbl> <dbl>
1 a NA NA 1. 2.
2 a NA NA 2. 3.
3 a 5. 3. 3. 1.
4 a 4. 3. 4. 0.
5 b NA NA 1. 5.
6 b NA NA 2. 7.
7 b 12. 7. 3. 3.
8 b 10. 7. 4. 2.
First, it is computing the "sum_recent2week" and "max_recent2week" per group. Second, it selects the last two rows per group. Finally, it merges it with the original data.
Or if you want to compute it for all rows, not just for the last two rows per group:
dt %>%
group_by(id) %>%
mutate(sum_recent2week = lag(value + lead(value), n = 2),
max_recent2week = pmax(lag(value, n = 2), lag(value, n = 1)))

How do we create the data frame using column name,number of missing values and their percentage

Missing_Values = data.frame(colSums(is.na(train)))
Missing_Values_per = data.frame(colMeans(is.na(train))) * 100
data.frame(Column_Name = names(train))
i need to create the data frame using these three variables ,could someone help on this
try this:
library(tidyverse)
train <- tibble(a = c(NA, 1, 4, NA, NA),
b = c(6, NA, NA, NA, NA))
train %>%
gather(column_name, v) %>%
group_by(column_name) %>%
summarize(missing_values = sum(is.na(v)),
missing_values_per = mean(is.na(v)) * 100)

Multiple columns processing and dynamically naming new columns

Variables are mistakenly being entered into multiple columns eg: "aaa_1", "aaa_2" and "aaa_3", or "ccc_1, "ccc_2", and "ccc_3"). Need to create single new columns (eg "aaa", or "ccc"). Some variables are currently in a single column though ("hhh_1"), but more columns may be added (hhh_2 etc).
This is what I got:
aaa_1 <- c(43, 23, 65, NA, 45)
aaa_2 <- c(NA, NA, NA, NA, NA)
aaa_3 <- c(NA, NA, 92, NA, 82)
ccc_1 <- c("fra", NA, "spa", NA, NA)
ccc_2 <- c(NA, NA, NA, "wez", NA)
ccc_3 <- c(NA, "ija", NA, "fda", NA)
ccc_4 <- c(NA, NA, NA, NA, NA)
hhh_1 <- c(183, NA, 198, NA, 182)
dataf1 <- data.frame(aaa_1,aaa_2,aaa_3,ccc_1,ccc_2, ccc_3,ccc_4,hhh_1)
This is what I want:
aaa <- c(43, 23, NA, NA, NA)
ccc <- c("fra", "ija", "spa", NA, NA)
hhh <- c(183, NA, 198, NA, 182)
dataf2 <- data.frame(aaa,ccc,hhh)
General solution needed as there are ~100 variables (eg "aaa", "hhh", "ccc", "ttt", "eee", "hhh"etc).
Thanks!
This is a base solution, i.e. no packages.
First define get_only which when given a list converts it to a data.frame and applies get_only to each row. When given a vector it returns the single non-NA in it or NA if there is not only one.
Define root to be the column names without the suffixes.
Convert the data frame to a list of columns, group them by root and apply get_only to each such group.
Finally, convert the resulting list to a data frame.
get_only <- function(x) UseMethod("get_only")
get_only.list <- function(x) apply(data.frame(x), 1, get_only)
get_only.default <- function(x) if (sum(!is.na(x)) == 1) na.omit(x) else NA
root <- sub("_.*", "", names(dataf1))
as.data.frame(lapply(split(as.list(dataf1), root), FUN = get_only))
giving:
age country hight
1 43 fra 183
2 23 ija NA
3 NA spa 198
4 NA <NA> NA
5 NA <NA> 182
We may try with splitstackshape
library(splitstackshape)
nm1 <- sub("_\\d+", "", names(dataf1))
tbl <- table(nm1) > 1
merged.stack(dataf1, var.stubs = names(tbl)[tbl], sep="_")
I'm not sure your example is right. For example in the third row you've got values for both age_1 and age_3, then in the desired output NA for that row.
If I've understood what you're trying to do though, it will be much easier if you transpose columns to rows, fix them and then transpose back again. Try this as a start point using the 'tidyverse' of dplyr and tidyr.
library(tidyverse)
library(stringr)
age_1 <- c(43, 23, 65, NA, 45)
age_2 <- c(NA, NA, NA, NA, NA)
age_3 <- c(NA, NA, 92, NA, 82)
country_1 <- c("fra", NA, "spa", NA, NA)
country_2 <- c(NA, NA, NA, "wez", NA)
country_3 <- c(NA, "ija", NA, "fda", NA)
country_4 <- c(NA, NA, NA, NA, NA)
hight_1 <- c(183, NA, 198, NA, 182)
dataf1 <- data.frame(age_1,age_2,age_3,country_1,country_2, country_3,country_4,hight_1)
data <- dataf1 %>%
mutate(row_num = row_number()) %>% #create a row number to track values
gather(key, value, -row_num) %>% #flatten your data
drop_na() %>% #drop na rows
mutate(key = str_replace(key, "_.", "")) %>% #remove the '_x' part of names
group_by(row_num) %>%
top_n(1) %>%
spread(key, value) #pivot back to columns
For your example you need the group_by() and top_n() lines to make it run because you've got multiple values in the same row. If you only have one value (as I think you should?) then you can remove these two lines. It will be better without them because then it won't run if your data is wrong.
Edit following comment below. This will make any duplicated entries NA.
data <- dataf1 %>%
mutate(row_num = row_number()) %>% #create a row number to track values
gather(key, value, -row_num) %>% #flatten your data
drop_na() %>% #drop na rows
mutate(key = str_replace(key, "_.", "")) %>% #remove the '_x' part of names
group_by(row_num, key) %>%
mutate(count = n()) %>% #count how many entries for each row/key combo
mutate(value = ifelse(count > 1, NA, value)) %>% #set NA for rows with duplicates
drop_na() %>%
spread(key, value) %>% #pivot back to columns
select(-count) #drop the `count` variable

How to make a list with calculations based on identical values in two columns of a data frame?

I want to scan the column 'var2' for elements that appear in an other column 'var1' and, where they match, find the proportion 'value2'/'value1' depending on the levels of the column 'type'. The desired output would be a list or a data frame.
var1<- c(8, 1, 1, 2, 3, 1, 4, 5, 2, 6, 1, 3, 7)
var2 <- c(NA,8,NA,NA,NA,3,NA,NA,5,NA,6,6,NA)
value1 <- c(4340, NA, 3740, 3825, 3845, NA, 4005, 2660, NA, 3055, NA, NA, 5800)
value2 <- c(NA, 30, NA, NA, NA, 5, NA, NA, 15, NA, 1, 20, NA)
type <- c('Fish','Crab','Fish','Fish','Fish','Bird','Fish','Fish','Bird','Fish','Bird','Crab','Fish')
df <- data.frame(var1,var2,value1,value2,type)
Thanks to #Veerendra Gadekar who came up with the solution in a comment.
lapply(split(df, df$var2),
function(x)
c(x$value2/df$value1[df$var1 == unique(x$var2) & !is.na(df$value1)])
)
#$`3`
#[1] 0.00130039
#
#$`5`
#[1] 0.005639098
#
#$`6`
#[1] 0.0003273322 0.0065466448
#
#$`8`
#[1] 0.006912442

Resources